From cd99c874b23264e5187d5ef5eb442cb8e3cfe7b2 Mon Sep 17 00:00:00 2001
From: ellnor01 <51320439+ellnor01@users.noreply.github.com>
Date: Fri, 18 Dec 2020 07:50:30 +0000
Subject: [PATCH 001/158] Reduce number of compilations in buffer suite (#1082)

* Reduce number of compilations in buffer suite

Moves program and kernel compilation out of the mem_flags loop,
as the programs and kernels were being recompiled unnecessarily.

Fixes #1020

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>

* Remove misplaced frees in buffer tests

Contributes to #1020

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>
---
 test_conformance/buffers/test_buffer_fill.cpp | 215 +++++++-----------
 test_conformance/buffers/test_buffer_map.cpp  |  50 ++--
 test_conformance/buffers/test_buffer_read.cpp | 167 +++++---------
 .../buffers/test_buffer_write.cpp             | 137 +++++------
 4 files changed, 216 insertions(+), 353 deletions(-)

diff --git a/test_conformance/buffers/test_buffer_fill.cpp b/test_conformance/buffers/test_buffer_fill.cpp
index 5c1dd48e57..2a12bd8c35 100644
--- a/test_conformance/buffers/test_buffer_fill.cpp
+++ b/test_conformance/buffers/test_buffer_fill.cpp
@@ -562,11 +562,11 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
                      int loops, void *inptr[5], void *hostptr[5], void *pattern[5], size_t offset_elements, size_t fill_elements,
                      const char *kernelCode[], const char *kernelName[], int (*fn)(void *,void *,int) )
 {
-    cl_mem      buffers[10];
+    clMemWrapper buffers[10];
     void        *outptr[5];
-    cl_program  program[5];
-    cl_kernel   kernel[5];
-    cl_event    event[2];
+    clProgramWrapper program[5];
+    clKernelWrapper kernel[5];
+    clEventWrapper event[2];
     size_t      ptrSizes[5];
     size_t      global_work_size[3];
     int         err;
@@ -584,12 +584,22 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
     ptrSizes[3] = ptrSizes[2] << 1;
     ptrSizes[4] = ptrSizes[3] << 1;
 
-    for (src_flag_id=0; src_flag_id < NUM_FLAGS; src_flag_id++) {
-        log_info("Testing with cl_mem_flags: %s\n", flag_set_names[src_flag_id]);
+    loops = (loops < 5 ? loops : 5);
+    for (i = 0; i < loops; i++)
+    {
+        ii = i << 1;
+
+        err = create_single_kernel_helper(context, &program[i], &kernel[i], 1,
+                                          &kernelCode[i], kernelName[i]);
+        if (err)
+        {
+            log_error(" Error creating program for %s\n", type);
+            return -1;
+        }
+
+        for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
+        {
 
-        loops = ( loops < 5 ? loops : 5 );
-        for ( i = 0; i < loops; i++ ){
-            ii = i << 1;
             if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
                 buffers[ii] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, hostptr[i], &err);
             else
@@ -612,7 +622,6 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
             buffers[ii+1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,  ptrSizes[i] * num_elements, outptr[i], &err);
             if ( !buffers[ii+1] || err){
                 print_error(err, "clCreateBuffer failed\n" );
-                clReleaseMemObject( buffers[ii] );
                 align_free( outptr[i] );
                 return -1;
             }
@@ -625,17 +634,6 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
              */
             if ( err != CL_SUCCESS ){
                 print_error( err, " clEnqueueFillBuffer failed" );
-                clReleaseMemObject( buffers[ii] );
-                clReleaseMemObject( buffers[ii+1] );
-                align_free( outptr[i] );
-                return -1;
-            }
-
-            err = create_single_kernel_helper( context, &program[i], &kernel[i], 1, &kernelCode[i], kernelName[i] );
-            if ( err ){
-                log_error( " Error creating program for %s\n", type );
-                clReleaseMemObject( buffers[ii] );
-                clReleaseMemObject( buffers[ii+1] );
                 align_free( outptr[i] );
                 return -1;
             }
@@ -644,10 +642,6 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
             err |= clSetKernelArg( kernel[i], 1, sizeof( cl_mem ), (void *)&buffers[ii+1] );
             if ( err != CL_SUCCESS ){
                 print_error( err, "clSetKernelArg failed" );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
-                clReleaseMemObject( buffers[ii] );
-                clReleaseMemObject( buffers[ii+1] );
                 align_free( outptr[i] );
                 return -1;
             }
@@ -655,14 +649,9 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
             err = clWaitForEvents(  1, &(event[0]) );
             if ( err != CL_SUCCESS ){
                 print_error( err, "clWaitForEvents() failed" );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
-                clReleaseMemObject( buffers[ii] );
-                clReleaseMemObject( buffers[ii+1] );
                 align_free( outptr[i] );
                 return -1;
             }
-            clReleaseEvent(event[0]);
 
             err = clEnqueueNDRangeKernel( queue, kernel[i], 1, NULL, global_work_size, NULL, 0, NULL, NULL );
             if (err != CL_SUCCESS){
@@ -680,21 +669,18 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
             if ( err != CL_SUCCESS ){
                 print_error( err, "clWaitForEvents() failed" );
             }
-            clReleaseEvent(event[1]);
 
             if ( fn( inptr[i], outptr[i], (int)(ptrSizes[i] * (size_t)num_elements / ptrSizes[0]) ) ){
-                log_error( " %s%d test failed\n", type, 1<<i );
+                log_error(" %s%d test failed. (cl_mem_flags: %s)\n", type,
+                          1 << i, flag_set_names[src_flag_id]);
                 total_errors++;
             }
             else{
-                log_info( " %s%d test passed\n", type, 1<<i );
+                log_info(" %s%d test passed (cl_mem_flags: %s)\n", type, 1 << i,
+                         flag_set_names[src_flag_id]);
             }
 
             // cleanup
-            clReleaseMemObject( buffers[ii] );
-            clReleaseMemObject( buffers[ii+1] );
-            clReleaseKernel( kernel[i] );
-            clReleaseProgram( program[i] );
             align_free( outptr[i] );
         }
     } // src cl_mem_flag
@@ -706,14 +692,14 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
 
 int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
 {
-    cl_mem      buffers[2];
+    clMemWrapper buffers[2];
     void        *outptr;
     TestStruct  *inptr;
     TestStruct  *hostptr;
-    TestStruct  *pattern;
-    cl_program  program;
-    cl_kernel   kernel;
-    cl_event    event[2];
+    TestStruct pattern;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clEventWrapper event[2];
     size_t      ptrSize = sizeof( TestStruct );
     size_t      global_work_size[3];
     int         n, err;
@@ -726,35 +712,57 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
 
     global_work_size[0] = (size_t)num_elements;
 
-    // Test with random offsets and fill sizes
-    for ( n = 0; n < 8; n++ ){
-        offset_elements = (size_t)get_random_float( 0.f, (float)(num_elements - 8), d );
-        fill_elements = (size_t)get_random_float( 8.f, (float)(num_elements - offset_elements), d );
-        log_info( "Testing random fill from offset %d for %d elements: \n", (int)offset_elements, (int)fill_elements );
-
-        pattern = (TestStruct *)malloc(ptrSize);
-        pattern->a = (cl_int)genrand_int32(d);
-        pattern->b = (cl_float)get_random_float( -FLT_MAX, FLT_MAX, d );
 
-        inptr = (TestStruct *)align_malloc(ptrSize * num_elements, min_alignment);
-        for ( j = 0; j < offset_elements; j++ ) {
-            inptr[j].a = 0;
-            inptr[j].b =0;
-        }
-        for ( j = offset_elements; j < offset_elements + fill_elements; j++ ) {
-            inptr[j].a = pattern->a;
-            inptr[j].b = pattern->b;
-        }
-        for ( j = offset_elements + fill_elements; j < (size_t)num_elements; j++ ) {
-            inptr[j].a = 0;
-            inptr[j].b = 0;
+    for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
+    {
+        log_info("Testing with cl_mem_flags: %s\n",
+                 flag_set_names[src_flag_id]);
+
+        err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                          &struct_kernel_code,
+                                          "read_fill_struct");
+        if (err)
+        {
+            log_error(" Error creating program for struct\n");
+            free_mtdata(d);
+            return -1;
         }
 
-        hostptr = (TestStruct *)align_malloc(ptrSize * num_elements, min_alignment);
-        memset(hostptr, 0, ptrSize * num_elements);
+        // Test with random offsets and fill sizes
+        for (n = 0; n < 8; n++)
+        {
+            offset_elements =
+                (size_t)get_random_float(0.f, (float)(num_elements - 8), d);
+            fill_elements = (size_t)get_random_float(
+                8.f, (float)(num_elements - offset_elements), d);
+            log_info("Testing random fill from offset %d for %d elements: \n",
+                     (int)offset_elements, (int)fill_elements);
+
+            pattern.a = (cl_int)genrand_int32(d);
+            pattern.b = (cl_float)get_random_float(-FLT_MAX, FLT_MAX, d);
+
+            inptr = (TestStruct *)align_malloc(ptrSize * num_elements,
+                                               min_alignment);
+            for (j = 0; j < offset_elements; j++)
+            {
+                inptr[j].a = 0;
+                inptr[j].b = 0;
+            }
+            for (j = offset_elements; j < offset_elements + fill_elements; j++)
+            {
+                inptr[j].a = pattern.a;
+                inptr[j].b = pattern.b;
+            }
+            for (j = offset_elements + fill_elements; j < (size_t)num_elements;
+                 j++)
+            {
+                inptr[j].a = 0;
+                inptr[j].b = 0;
+            }
 
-        for (src_flag_id=0; src_flag_id < NUM_FLAGS; src_flag_id++) {
-            log_info("Testing with cl_mem_flags: %s\n", flag_set_names[src_flag_id]);
+            hostptr = (TestStruct *)align_malloc(ptrSize * num_elements,
+                                                 min_alignment);
+            memset(hostptr, 0, ptrSize * num_elements);
 
             if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
                 buffers[0] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSize * num_elements, hostptr, &err);
@@ -762,9 +770,6 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
                 buffers[0] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSize * num_elements, NULL, &err);
             if ( err ){
                 print_error(err, " clCreateBuffer failed\n" );
-                clReleaseEvent( event[0] );
-                clReleaseEvent( event[1] );
-                free( (void *)pattern );
                 align_free( (void *)inptr );
                 align_free( (void *)hostptr );
                 free_mtdata(d);
@@ -774,9 +779,6 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
                 err = clEnqueueWriteBuffer(queue, buffers[0], CL_FALSE, 0, ptrSize * num_elements, hostptr, 0, NULL, NULL);
                 if ( err != CL_SUCCESS ){
                     print_error(err, " clEnqueueWriteBuffer failed\n" );
-                    clReleaseEvent( event[0] );
-                    clReleaseEvent( event[1] );
-                    free( (void *)pattern );
                     align_free( (void *)inptr );
                     align_free( (void *)hostptr );
                     free_mtdata(d);
@@ -789,45 +791,21 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
             if ( ! buffers[1] || err){
                 print_error(err, " clCreateBuffer failed\n" );
                 align_free( outptr );
-                clReleaseMemObject(buffers[0]);
-                clReleaseEvent( event[0] );
-                clReleaseEvent( event[1] );
-                free( (void *)pattern );
                 align_free( (void *)inptr );
                 align_free( (void *)hostptr );
                 free_mtdata(d);
                 return -1;
             }
 
-            err = clEnqueueFillBuffer(queue, buffers[0], pattern, ptrSize,
-                                      ptrSize * offset_elements, ptrSize * fill_elements,
-                                      0, NULL, &(event[0]));
+            err = clEnqueueFillBuffer(
+                queue, buffers[0], &pattern, ptrSize, ptrSize * offset_elements,
+                ptrSize * fill_elements, 0, NULL, &(event[0]));
             /* uncomment for test debugging
              err = clEnqueueWriteBuffer(queue, buffers[0], CL_FALSE, 0, ptrSize * num_elements, inptr, 0, NULL, &(event[0]));
              */
             if ( err != CL_SUCCESS ){
                 print_error( err, " clEnqueueFillBuffer failed" );
                 align_free( outptr );
-                clReleaseMemObject(buffers[0]);
-                clReleaseMemObject(buffers[1]);
-                clReleaseEvent( event[0] );
-                clReleaseEvent( event[1] );
-                free( (void *)pattern );
-                align_free( (void *)inptr );
-                align_free( (void *)hostptr );
-                free_mtdata(d);
-                return -1;
-            }
-
-            err = create_single_kernel_helper( context, &program, &kernel, 1, &struct_kernel_code, "read_fill_struct" );
-            if ( err ){
-                log_error( " Error creating program for struct\n" );
-                align_free( outptr );
-                clReleaseMemObject(buffers[0]);
-                clReleaseMemObject(buffers[1]);
-                clReleaseEvent( event[0] );
-                clReleaseEvent( event[1] );
-                free( (void *)pattern );
                 align_free( (void *)inptr );
                 align_free( (void *)hostptr );
                 free_mtdata(d);
@@ -838,14 +816,7 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
             err |= clSetKernelArg( kernel, 1, sizeof( cl_mem ), (void *)&buffers[1] );
             if ( err != CL_SUCCESS ){
                 print_error( err, " clSetKernelArg failed" );
-                clReleaseKernel( kernel );
-                clReleaseProgram( program );
                 align_free( outptr );
-                clReleaseMemObject(buffers[0]);
-                clReleaseMemObject(buffers[1]);
-                clReleaseEvent( event[0] );
-                clReleaseEvent( event[1] );
-                free( (void *)pattern );
                 align_free( (void *)inptr );
                 align_free( (void *)hostptr );
                 free_mtdata(d);
@@ -855,14 +826,7 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
             err = clWaitForEvents(  1, &(event[0]) );
             if ( err != CL_SUCCESS ){
                 print_error( err, "clWaitForEvents() failed" );
-                clReleaseKernel( kernel );
-                clReleaseProgram( program );
                 align_free( outptr );
-                clReleaseMemObject(buffers[0]);
-                clReleaseMemObject(buffers[1]);
-                clReleaseEvent( event[0] );
-                clReleaseEvent( event[1] );
-                free( (void *)pattern );
                 align_free( (void *)inptr );
                 align_free( (void *)hostptr );
                 free_mtdata(d);
@@ -873,14 +837,7 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
             err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL );
             if ( err != CL_SUCCESS ){
                 print_error( err, " clEnqueueNDRangeKernel failed" );
-                clReleaseKernel( kernel );
-                clReleaseProgram( program );
                 align_free( outptr );
-                clReleaseMemObject(buffers[0]);
-                clReleaseMemObject(buffers[1]);
-                clReleaseEvent( event[0] );
-                clReleaseEvent( event[1] );
-                free( (void *)pattern );
                 align_free( (void *)inptr );
                 align_free( (void *)hostptr );
                 free_mtdata(d);
@@ -890,14 +847,7 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
             err = clEnqueueReadBuffer( queue, buffers[1], CL_FALSE, 0, ptrSize * num_elements, outptr, 0, NULL, &(event[1]) );
             if ( err != CL_SUCCESS ){
                 print_error( err, " clEnqueueReadBuffer failed" );
-                clReleaseKernel( kernel );
-                clReleaseProgram( program );
                 align_free( outptr );
-                clReleaseMemObject(buffers[0]);
-                clReleaseMemObject(buffers[1]);
-                clReleaseEvent( event[0] );
-                clReleaseEvent( event[1] );
-                free( (void *)pattern );
                 align_free( (void *)inptr );
                 align_free( (void *)hostptr );
                 free_mtdata(d);
@@ -918,15 +868,10 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
                 log_info( " buffer_FILL async struct test passed\n" );
             }
             // cleanup
-            clReleaseKernel( kernel );
-            clReleaseProgram( program );
             align_free( outptr );
-            clReleaseMemObject( buffers[0] );
-            clReleaseMemObject( buffers[1] );
+            align_free((void *)inptr);
+            align_free((void *)hostptr);
         } // src cl_mem_flag
-        free( (void *)pattern );
-        align_free( (void *)inptr );
-        align_free( (void *)hostptr );
     }
 
     free_mtdata(d);
diff --git a/test_conformance/buffers/test_buffer_map.cpp b/test_conformance/buffers/test_buffer_map.cpp
index f0363dd5b5..3cbcd387d7 100644
--- a/test_conformance/buffers/test_buffer_map.cpp
+++ b/test_conformance/buffers/test_buffer_map.cpp
@@ -554,10 +554,10 @@ static int verify_read_struct( void *ptr, int n )
 static int test_buffer_map_read( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, size_t size, char *type, int loops,
                                  const char *kernelCode[], const char *kernelName[], int (*fn)(void *,int) )
 {
-    cl_mem      buffers[5];
+    clMemWrapper buffers[5];
     void        *outptr[5];
-    cl_program  program[5];
-    cl_kernel   kernel[5];
+    clProgramWrapper program[5];
+    clKernelWrapper kernel[5];
     size_t      threads[3], localThreads[3];
     cl_int      err;
     int         i;
@@ -580,10 +580,20 @@ static int test_buffer_map_read( cl_device_id deviceID, cl_context context, cl_c
     if (! gHasLong && strstr(type,"long"))
         return 0;
 
-    for (src_flag_id=0; src_flag_id < NUM_FLAGS; src_flag_id++) {
-        log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]);
+    for (i = 0; i < loops; i++)
+    {
+
+        err = create_single_kernel_helper(context, &program[i], &kernel[i], 1,
+                                          &kernelCode[i], kernelName[i]);
+        if (err)
+        {
+            log_error(" Error creating program for %s\n", type);
+            return -1;
+        }
+
+        for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
+        {
 
-        for ( i = 0; i < loops; i++ ){
             outptr[i] = align_malloc( ptrSizes[i] * num_elements, min_alignment);
             if ( ! outptr[i] ){
                 log_error( " unable to allocate %d bytes of memory\n", (int)ptrSizes[i] * num_elements );
@@ -602,20 +612,9 @@ static int test_buffer_map_read( cl_device_id deviceID, cl_context context, cl_c
                 return -1;
             }
 
-            err = create_single_kernel_helper(context, &program[i], &kernel[i], 1, &kernelCode[i], kernelName[i] );
-            if ( err ){
-                log_error( " Error creating program for %s\n", type );
-                clReleaseMemObject( buffers[i] );
-                align_free( outptr[i] );
-                return -1;
-            }
-
             err = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), (void *)&buffers[i] );
             if ( err != CL_SUCCESS ){
                 print_error( err, "clSetKernelArg failed\n" );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
-                clReleaseMemObject( buffers[i] );
                 align_free( outptr[i] );
                 return -1;
             }
@@ -628,9 +627,6 @@ static int test_buffer_map_read( cl_device_id deviceID, cl_context context, cl_c
             err = clEnqueueNDRangeKernel( queue, kernel[i], 1, NULL, threads, localThreads, 0, NULL, NULL );
             if ( err != CL_SUCCESS ){
                 print_error( err, "clEnqueueNDRangeKernel failed\n" );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
-                clReleaseMemObject( buffers[i] );
                 align_free( outptr[i] );
                 return -1;
             }
@@ -638,29 +634,23 @@ static int test_buffer_map_read( cl_device_id deviceID, cl_context context, cl_c
             mappedPtr = clEnqueueMapBuffer(queue, buffers[i], CL_TRUE, CL_MAP_READ, 0, ptrSizes[i]*num_elements, 0, NULL, NULL, &err);
             if ( err != CL_SUCCESS ){
                 print_error( err, "clEnqueueMapBuffer failed" );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
-                clReleaseMemObject( buffers[i] );
                 align_free( outptr[i] );
                 return -1;
             }
 
             if (fn(mappedPtr, num_elements*(1<<i))){
-                log_error(" %s%d test failed\n", type, 1<<i);
+                log_error(" %s%d test failed. cl_mem_flags src: %s\n", type,
+                          1 << i, flag_set_names[src_flag_id]);
                 total_errors++;
             }
             else{
-                log_info(" %s%d test passed\n", type, 1<<i);
+                log_info(" %s%d test passed. cl_mem_flags src: %s\n", type,
+                         1 << i, flag_set_names[src_flag_id]);
             }
 
             err = clEnqueueUnmapMemObject(queue, buffers[i], mappedPtr, 0, NULL, NULL);
             test_error(err, "clEnqueueUnmapMemObject failed");
 
-            // cleanup
-            clReleaseKernel( kernel[i] );
-            clReleaseProgram( program[i] );
-            clReleaseMemObject( buffers[i] );
-
             // If we are using the outptr[i] as backing via USE_HOST_PTR we need to make sure we are done before freeing.
             if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR)) {
                 err = clFinish(queue);
diff --git a/test_conformance/buffers/test_buffer_read.cpp b/test_conformance/buffers/test_buffer_read.cpp
index a3d025441d..0e533fa586 100644
--- a/test_conformance/buffers/test_buffer_read.cpp
+++ b/test_conformance/buffers/test_buffer_read.cpp
@@ -621,11 +621,11 @@ static int verify_read_struct(TestStruct *outptr, int n)
 int test_buffer_read( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, size_t size, char *type, int loops,
                       const char *kernelCode[], const char *kernelName[], int (*fn)(void *,int) )
 {
-    cl_mem      buffers[5];
+    clMemWrapper buffers[5];
     void        *outptr[5];
     void        *inptr[5];
-    cl_program  program[5];
-    cl_kernel   kernel[5];
+    clProgramWrapper program[5];
+    clKernelWrapper kernel[5];
     size_t      global_work_size[3];
     cl_int      err;
     int         i;
@@ -650,10 +650,21 @@ int test_buffer_read( cl_device_id deviceID, cl_context context, cl_command_queu
         return CL_SUCCESS;
     }
 
-    for (src_flag_id=0; src_flag_id < NUM_FLAGS; src_flag_id++) {
-        log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]);
+    for (i = 0; i < loops; i++)
+    {
+
+        err = create_single_kernel_helper(context, &program[i], &kernel[i], 1,
+                                          &kernelCode[i], kernelName[i]);
+        if (err)
+        {
+            log_error("Creating program for %s\n", type);
+            print_error(err, " Error creating program ");
+            return -1;
+        }
+
+        for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
+        {
 
-        for ( i = 0; i < loops; i++ ){
             outptr[i] = align_malloc( ptrSizes[i] * num_elements, min_alignment);
             if ( ! outptr[i] ){
                 log_error( " unable to allocate %d bytes for outptr\n", (int)( ptrSizes[i] * num_elements ) );
@@ -677,22 +688,9 @@ int test_buffer_read( cl_device_id deviceID, cl_context context, cl_command_queu
                 return -1;
             }
 
-            err = create_single_kernel_helper(  context, &program[i], &kernel[i], 1, &kernelCode[i], kernelName[i] );
-            if ( err ){
-                log_error("Creating program for %s\n", type);
-                print_error(err,  " Error creating program " );
-                clReleaseMemObject(buffers[i]);
-                align_free( outptr[i] );
-                align_free( inptr[i] );
-                return -1;
-            }
-
             err = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), (void *)&buffers[i] );
             if ( err != CL_SUCCESS ){
                 print_error( err, "clSetKernelArg failed" );
-                clReleaseMemObject( buffers[i] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
                 align_free( inptr[i] );
                 return -1;
@@ -701,9 +699,6 @@ int test_buffer_read( cl_device_id deviceID, cl_context context, cl_command_queu
             err = clEnqueueNDRangeKernel( queue, kernel[i], 1, NULL, global_work_size, NULL, 0, NULL, NULL );
             if ( err != CL_SUCCESS ){
                 print_error( err, "clEnqueueNDRangeKernel failed" );
-                clReleaseMemObject( buffers[i] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
                 align_free( inptr[i] );
                 return -1;
@@ -712,28 +707,24 @@ int test_buffer_read( cl_device_id deviceID, cl_context context, cl_command_queu
             err = clEnqueueReadBuffer( queue, buffers[i], CL_TRUE, 0, ptrSizes[i]*num_elements, outptr[i], 0, NULL, NULL );
             if ( err != CL_SUCCESS ){
                 print_error( err, "clEnqueueReadBuffer failed" );
-                clReleaseMemObject( buffers[i] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
                 align_free( inptr[i] );
                 return -1;
             }
 
             if (fn(outptr[i], num_elements*(1<<i))){
-                log_error( " %s%d test failed\n", type, 1<<i );
+                log_error(" %s%d test failed. cl_mem_flags src: %s\n", type,
+                          1 << i, flag_set_names[src_flag_id]);
                 total_errors++;
             }
             else{
-                log_info( " %s%d test passed\n", type, 1<<i );
+                log_info(" %s%d test passed. cl_mem_flags src: %s\n", type,
+                         1 << i, flag_set_names[src_flag_id]);
             }
 
             err = clEnqueueReadBuffer( queue, buffers[i], CL_TRUE, 0, ptrSizes[i]*num_elements, inptr[i], 0, NULL, NULL );
             if ( err != CL_SUCCESS ){
                 print_error( err, "clEnqueueReadBuffer failed" );
-                clReleaseMemObject( buffers[i] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
                 align_free( inptr[i] );
                 return -1;
@@ -749,9 +740,6 @@ int test_buffer_read( cl_device_id deviceID, cl_context context, cl_command_queu
 
 
             // cleanup
-            clReleaseMemObject( buffers[i] );
-            clReleaseKernel( kernel[i] );
-            clReleaseProgram( program[i] );
             align_free( outptr[i] );
             align_free( inptr[i] );
         }
@@ -764,10 +752,10 @@ int test_buffer_read( cl_device_id deviceID, cl_context context, cl_command_queu
 int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, size_t size, char *type, int loops,
                             const char *kernelCode[], const char *kernelName[], int (*fn)(void *,int) )
 {
-    cl_mem      buffers[5];
-    cl_program  program[5];
-    cl_kernel   kernel[5];
-    cl_event    event;
+    clMemWrapper buffers[5];
+    clProgramWrapper program[5];
+    clKernelWrapper kernel[5];
+    clEventWrapper event;
     void        *outptr[5];
     void        *inptr[5];
     size_t      global_work_size[3];
@@ -795,10 +783,20 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman
         return CL_SUCCESS;
     }
 
-    for (src_flag_id=0; src_flag_id < NUM_FLAGS; src_flag_id++) {
-        log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]);
+    for (i = 0; i < loops; i++)
+    {
+
+        err = create_single_kernel_helper(context, &program[i], &kernel[i], 1,
+                                          &kernelCode[i], kernelName[i]);
+        if (err)
+        {
+            log_error(" Error creating program for %s\n", type);
+            return -1;
+        }
+
+        for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
+        {
 
-        for ( i = 0; i < loops; i++ ){
             outptr[i] = align_malloc(ptrSizes[i] * num_elements, min_alignment);
             if ( ! outptr[i] ){
                 log_error( " unable to allocate %d bytes for outptr\n", (int)(ptrSizes[i] * num_elements) );
@@ -824,21 +822,9 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman
                 return -1;
             }
 
-            err = create_single_kernel_helper( context, &program[i], &kernel[i], 1, &kernelCode[i], kernelName[i]);
-            if ( err ){
-                log_error( " Error creating program for %s\n", type );
-                clReleaseMemObject( buffers[i] );
-                align_free( outptr[i] );
-                align_free( inptr[i] );
-                return -1;
-            }
-
             err = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), (void *)&buffers[i] );
             if ( err != CL_SUCCESS ){
                 print_error( err, "clSetKernelArg failed" );
-                clReleaseMemObject( buffers[i] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
                 align_free( inptr[i] );
                 return -1;
@@ -847,9 +833,6 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman
             err = clEnqueueNDRangeKernel( queue, kernel[i], 1, NULL, global_work_size, NULL, 0, NULL, NULL );
             if ( err != CL_SUCCESS ){
                 print_error( err, "clEnqueueNDRangeKernel failed" );
-                clReleaseMemObject( buffers[i] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
                 align_free( inptr[i] );
                 return -1;
@@ -865,9 +848,6 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman
 #endif
             if ( err != CL_SUCCESS ){
                 print_error( err, "clEnqueueReadBuffer failed" );
-                clReleaseMemObject( buffers[i] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
                 align_free( inptr[i] );
                 return -1;
@@ -875,27 +855,22 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman
             err = clWaitForEvents(1, &event );
             if ( err != CL_SUCCESS ){
                 print_error( err, "clWaitForEvents() failed" );
-                clReleaseMemObject( buffers[i] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
                 align_free( inptr[i] );
                 return -1;
             }
 
             if ( fn(outptr[i], num_elements*(1<<i)) ){
-                log_error( " %s%d test failed\n", type, 1<<i );
+                log_error(" %s%d test failed. cl_mem_flags src: %s\n", type,
+                          1 << i, flag_set_names[src_flag_id]);
                 total_errors++;
             }
             else{
-                log_info( " %s%d test passed\n", type, 1<<i );
+                log_info(" %s%d test passed. cl_mem_flags src: %s\n", type,
+                         1 << i, flag_set_names[src_flag_id]);
             }
 
             // cleanup
-            clReleaseEvent( event );
-            clReleaseMemObject( buffers[i] );
-            clReleaseKernel( kernel[i] );
-            clReleaseProgram( program[i] );
             align_free( outptr[i] );
             align_free( inptr[i] );
         }
@@ -910,10 +885,10 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman
 int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, size_t size, char *type, int loops,
                                     const char *kernelCode[], const char *kernelName[], int (*fn)(void *,int) )
 {
-    cl_mem      buffers[5];
-    cl_program  program[5];
-    cl_kernel   kernel[5];
-    cl_event    event;
+    clMemWrapper buffers[5];
+    clProgramWrapper program[5];
+    clKernelWrapper kernel[5];
+    clEventWrapper event;
     void        *outptr[5], *inptr[5];
     size_t      global_work_size[3];
     cl_int      err;
@@ -940,10 +915,20 @@ int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, c
         return CL_SUCCESS;
     }
 
-    for (src_flag_id=0; src_flag_id < NUM_FLAGS; src_flag_id++) {
-        log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]);
+    for (i = 0; i < loops; i++)
+    {
+
+        err = create_single_kernel_helper(context, &program[i], &kernel[i], 1,
+                                          &kernelCode[i], kernelName[i]);
+        if (err)
+        {
+            log_error(" Error creating program for %s\n", type);
+            return -1;
+        }
+
+        for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
+        {
 
-        for ( i = 0; i < loops; i++ ){
             outptr[i] = align_malloc(ptrSizes[i] * num_elements, min_alignment);
             if ( ! outptr[i] ){
                 log_error( " unable to allocate %d bytes for outptr\n", (int)(ptrSizes[i] * num_elements) );
@@ -968,21 +953,9 @@ int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, c
                 return -1;
             }
 
-            err = create_single_kernel_helper(  context, &program[i], &kernel[i], 1, &kernelCode[i], kernelName[i] );
-            if ( err ){
-                log_error( " Error creating program for %s\n", type );
-                clReleaseMemObject( buffers[i] );
-                align_free( outptr[i] );
-                align_free( inptr[i] );
-                return -1;
-            }
-
             err = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), (void *)&buffers[i] );
             if ( err != CL_SUCCESS ){
                 print_error( err, "clSetKernelArgs failed" );
-                clReleaseMemObject( buffers[i] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
                 align_free( inptr[i] );
                 return -1;
@@ -991,9 +964,6 @@ int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, c
             err = clEnqueueNDRangeKernel( queue, kernel[i], 1, NULL, global_work_size, NULL, 0, NULL, NULL );
             if ( err != CL_SUCCESS ){
                 print_error( err, "clEnqueueNDRangeKernel failed" );
-                clReleaseMemObject( buffers[i] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
                 align_free( inptr[i] );
                 return -1;
@@ -1009,9 +979,6 @@ int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, c
 #endif
             if ( err != CL_SUCCESS ){
                 print_error( err, "clEnqueueReadBuffer failed" );
-                clReleaseMemObject( buffers[i] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
                 align_free( inptr[i] );
                 return -1;
@@ -1019,9 +986,6 @@ int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, c
             err = clEnqueueBarrierWithWaitList(queue, 0, NULL, NULL);
             if ( err != CL_SUCCESS ){
                 print_error( err, "clEnqueueBarrierWithWaitList() failed" );
-                clReleaseMemObject( buffers[i] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
                 return -1;
             }
@@ -1029,27 +993,22 @@ int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, c
             err = clWaitForEvents(1, &event);
             if ( err != CL_SUCCESS ){
                 print_error( err, "clWaitForEvents() failed" );
-                clReleaseMemObject( buffers[i] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
                 align_free( inptr[i] );
                 return -1;
             }
 
             if ( fn(outptr[i], num_elements*(1<<i)) ){
-                log_error(" %s%d test failed\n", type, 1<<i);
+                log_error(" %s%d test failed. cl_mem_flags src: %s\n", type,
+                          1 << i, flag_set_names[src_flag_id]);
                 total_errors++;
             }
             else{
-                log_info(" %s%d test passed\n", type, 1<<i);
+                log_info(" %s%d test passed. cl_mem_flags src: %s\n", type,
+                         1 << i, flag_set_names[src_flag_id]);
             }
 
             // cleanup
-            clReleaseEvent( event );
-            clReleaseMemObject( buffers[i] );
-            clReleaseKernel( kernel[i] );
-            clReleaseProgram( program[i] );
             align_free( outptr[i] );
             align_free( inptr[i] );
         }
diff --git a/test_conformance/buffers/test_buffer_write.cpp b/test_conformance/buffers/test_buffer_write.cpp
index 83b177957b..2497dd1765 100644
--- a/test_conformance/buffers/test_buffer_write.cpp
+++ b/test_conformance/buffers/test_buffer_write.cpp
@@ -624,10 +624,10 @@ static int verify_write_struct( void *ptr1, void *ptr2, int n )
 int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, size_t size, char *type, int loops,
                        void *inptr[5], const char *kernelCode[], const char *kernelName[], int (*fn)(void *,void *,int), MTdata d )
 {
-    cl_mem      buffers[10];
+    clMemWrapper buffers[10];
     void        *outptr[5];
-    cl_program  program[5];
-    cl_kernel   kernel[5];
+    clProgramWrapper program[5];
+    clKernelWrapper kernel[5];
     size_t      ptrSizes[5];
     size_t      global_work_size[3];
     cl_int      err;
@@ -645,12 +645,21 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
     ptrSizes[3] = ptrSizes[2] << 1;
     ptrSizes[4] = ptrSizes[3] << 1;
 
-    for (src_flag_id=0; src_flag_id < NUM_FLAGS; src_flag_id++) {
-        for (dst_flag_id=0; dst_flag_id < NUM_FLAGS; dst_flag_id++) {
-            log_info("Testing with cl_mem_flags src: %s dst: %s\n", flag_set_names[src_flag_id], flag_set_names[dst_flag_id]);
+    loops = (loops < 5 ? loops : 5);
+    for (i = 0; i < loops; i++)
+    {
+        err = create_single_kernel_helper(context, &program[i], &kernel[i], 1,
+                                          &kernelCode[i], kernelName[i]);
+        if (err)
+        {
+            log_error(" Error creating program for %s\n", type);
+            return -1;
+        }
 
-            loops = ( loops < 5 ? loops : 5 );
-            for ( i = 0; i < loops; i++ ){
+        for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
+        {
+            for (dst_flag_id = 0; dst_flag_id < NUM_FLAGS; dst_flag_id++)
+            {
                 ii = i << 1;
                 if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
                     buffers[ii] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, inptr[i], &err);
@@ -688,8 +697,6 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
                     dataPtr = clEnqueueMapBuffer(queue, buffers[ii], CL_TRUE, CL_MAP_WRITE, 0, ptrSizes[i]*num_elements, 0, NULL, NULL, &err);
                     if (err) {
                         print_error(err, "clEnqueueMapBuffer failed");
-                        clReleaseMemObject(buffers[ii]);
-                        clReleaseMemObject(buffers[ii+1]);
                         align_free( outptr[i] );
                         return -1;
                     }
@@ -699,8 +706,6 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
                     err = clEnqueueUnmapMemObject(queue, buffers[ii], dataPtr, 0, NULL, NULL);
                     if (err) {
                         print_error(err, "clEnqueueUnmapMemObject failed");
-                        clReleaseMemObject(buffers[ii]);
-                        clReleaseMemObject(buffers[ii+1]);
                         align_free( outptr[i] );
                         return -1;
                     }
@@ -708,30 +713,15 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
                 else if (!(flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) && !(flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR)) {
                     err = clEnqueueWriteBuffer(queue, buffers[ii], CL_TRUE, 0, ptrSizes[i]*num_elements, inptr[i], 0, NULL, NULL);
                     if ( err != CL_SUCCESS ){
-                        clReleaseMemObject(buffers[ii]);
-                        clReleaseMemObject(buffers[ii+1]);
                         align_free( outptr[i] );
                         print_error( err, " clWriteBuffer failed" );
                         return -1;
                     }
                 }
 
-                err = create_single_kernel_helper( context, &program[i], &kernel[i], 1, &kernelCode[i], kernelName[i] );
-                if ( err ){
-                    clReleaseMemObject(buffers[ii]);
-                    clReleaseMemObject(buffers[ii+1]);
-                    align_free( outptr[i] );
-                    log_error( " Error creating program for %s\n", type );
-                    return -1;
-                }
-
                 err = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), (void *)&buffers[ii] );
                 err |= clSetKernelArg( kernel[i], 1, sizeof( cl_mem ), (void *)&buffers[ii+1] );
                 if ( err != CL_SUCCESS ){
-                    clReleaseMemObject( buffers[ii] );
-                    clReleaseMemObject( buffers[ii+1] );
-                    clReleaseKernel( kernel[i] );
-                    clReleaseProgram( program[i] );
                     align_free( outptr[i] );
                     print_error( err, " clSetKernelArg failed" );
                     return -1;
@@ -740,10 +730,6 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
                 err = clEnqueueNDRangeKernel( queue, kernel[i], 1, NULL, global_work_size, NULL, 0, NULL, NULL );
                 if ( err != CL_SUCCESS ){
                     print_error( err, " clEnqueueNDRangeKernel failed" );
-                    clReleaseMemObject( buffers[ii] );
-                    clReleaseMemObject( buffers[ii+1] );
-                    clReleaseKernel( kernel[i] );
-                    clReleaseProgram( program[i] );
                     align_free( outptr[i] );
                     return -1;
                 }
@@ -755,27 +741,25 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
                     err = clEnqueueReadBuffer( queue, buffers[ii+1], true, 0, ptrSizes[i]*num_elements, outptr[i], 0, NULL, NULL );
                 }
                 if ( err != CL_SUCCESS ){
-                    clReleaseMemObject( buffers[ii] );
-                    clReleaseMemObject( buffers[ii+1] );
-                    clReleaseKernel( kernel[i] );
-                    clReleaseProgram( program[i] );
                     align_free( outptr[i] );
                     print_error( err, " clEnqueueReadBuffer failed" );
                     return -1;
                 }
 
                 if ( fn( inptr[i], outptr[i], (int)(ptrSizes[i] * (size_t)num_elements / ptrSizes[0]) ) ){
-                    log_error( " %s%d test failed\n", type, 1<<i );
+                    log_error(
+                        " %s%d test failed. cl_mem_flags src: %s dst: %s\n",
+                        type, 1 << i, flag_set_names[src_flag_id],
+                        flag_set_names[dst_flag_id]);
                     total_errors++;
                 }
                 else{
-                    log_info( " %s%d test passed\n", type, 1<<i );
+                    log_info(
+                        " %s%d test passed. cl_mem_flags src: %s dst: %s\n",
+                        type, 1 << i, flag_set_names[src_flag_id],
+                        flag_set_names[dst_flag_id]);
                 }
                 // cleanup
-                clReleaseMemObject( buffers[ii] );
-                clReleaseMemObject( buffers[ii+1] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
             }
         } // dst cl_mem_flag
@@ -790,11 +774,11 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
 
 int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
 {
-    cl_mem      buffers[10];
+    clMemWrapper buffers[10];
     void        *outptr[5];
     TestStruct  *inptr[5];
-    cl_program  program[5];
-    cl_kernel   kernel[5];
+    clProgramWrapper program[5];
+    clKernelWrapper kernel[5];
     size_t      ptrSizes[5];
     size_t      size = sizeof( TestStruct );
     size_t      global_work_size[3];
@@ -816,12 +800,24 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
     ptrSizes[3] = ptrSizes[2] << 1;
     ptrSizes[4] = ptrSizes[3] << 1;
 
-    for (src_flag_id=0; src_flag_id < NUM_FLAGS; src_flag_id++) {
-        for (dst_flag_id=0; dst_flag_id < NUM_FLAGS; dst_flag_id++) {
-            log_info("Testing with cl_mem_flags src: %s dst: %s\n", flag_set_names[src_flag_id], flag_set_names[dst_flag_id]);
+    loops = (loops < 5 ? loops : 5);
+    for (i = 0; i < loops; i++)
+    {
 
-            loops = ( loops < 5 ? loops : 5 );
-            for ( i = 0; i < loops; i++ ){
+        err = create_single_kernel_helper(context, &program[i], &kernel[i], 1,
+                                          &struct_kernel_code,
+                                          "read_write_struct");
+        if (err)
+        {
+            log_error(" Error creating program for struct\n");
+            free_mtdata(d);
+            return -1;
+        }
+
+        for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
+        {
+            for (dst_flag_id = 0; dst_flag_id < NUM_FLAGS; dst_flag_id++)
+            {
 
                 inptr[i] = (TestStruct *)align_malloc(ptrSizes[i] * num_elements, min_alignment);
 
@@ -847,7 +843,6 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
                 else
                     buffers[ii+1] = clCreateBuffer(context, flag_set[dst_flag_id],  ptrSizes[i] * num_elements, NULL, &err);
                 if ( ! buffers[ii+1] || err){
-                    clReleaseMemObject(buffers[ii]);
                     align_free( outptr[i] );
                     print_error(err, " clCreateBuffer failed\n" );
                     free_mtdata(d);
@@ -859,8 +854,6 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
                     dataPtr = clEnqueueMapBuffer(queue, buffers[ii], CL_TRUE, CL_MAP_WRITE, 0, ptrSizes[i]*num_elements, 0, NULL, NULL, &err);
                     if (err) {
                         print_error(err, "clEnqueueMapBuffer failed");
-                        clReleaseMemObject(buffers[ii]);
-                        clReleaseMemObject(buffers[ii+1]);
                         align_free( outptr[i] );
                         free_mtdata(d);
                         return -1;
@@ -871,8 +864,6 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
                     err = clEnqueueUnmapMemObject(queue, buffers[ii], dataPtr, 0, NULL, NULL);
                     if (err) {
                         print_error(err, "clEnqueueUnmapMemObject failed");
-                        clReleaseMemObject(buffers[ii]);
-                        clReleaseMemObject(buffers[ii+1]);
                         align_free( outptr[i] );
                         free_mtdata(d);
                         return -1;
@@ -881,8 +872,6 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
                 else if (!(flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) && !(flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR)) {
                     err = clEnqueueWriteBuffer(queue, buffers[ii], CL_TRUE, 0, ptrSizes[i]*num_elements, inptr[i], 0, NULL, NULL);
                     if ( err != CL_SUCCESS ){
-                        clReleaseMemObject(buffers[ii]);
-                        clReleaseMemObject(buffers[ii+1]);
                         align_free( outptr[i] );
                         print_error( err, " clWriteBuffer failed" );
                         free_mtdata(d);
@@ -890,23 +879,9 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
                     }
                 }
 
-                err = create_single_kernel_helper( context, &program[i], &kernel[i], 1, &struct_kernel_code, "read_write_struct" );
-                if ( err ){
-                    clReleaseMemObject(buffers[ii]);
-                    clReleaseMemObject(buffers[ii+1]);
-                    align_free( outptr[i] );
-                    log_error( " Error creating program for struct\n" );
-                    free_mtdata(d);
-                    return -1;
-                }
-
                 err = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), (void *)&buffers[ii] );
                 err |= clSetKernelArg( kernel[i], 1, sizeof( cl_mem ), (void *)&buffers[ii+1] );
                 if ( err != CL_SUCCESS ){
-                    clReleaseMemObject( buffers[ii] );
-                    clReleaseMemObject( buffers[ii+1] );
-                    clReleaseKernel( kernel[i] );
-                    clReleaseProgram( program[i] );
                     align_free( outptr[i] );
                     print_error( err, " clSetKernelArg failed" );
                     free_mtdata(d);
@@ -916,10 +891,6 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
                 err = clEnqueueNDRangeKernel( queue, kernel[i], 1, NULL, global_work_size, NULL, 0, NULL, NULL );
                 if ( err != CL_SUCCESS ){
                     print_error( err, " clEnqueueNDRangeKernel failed" );
-                    clReleaseMemObject( buffers[ii] );
-                    clReleaseMemObject( buffers[ii+1] );
-                    clReleaseKernel( kernel[i] );
-                    clReleaseProgram( program[i] );
                     align_free( outptr[i] );
                     free_mtdata(d);
                     return -1;
@@ -927,10 +898,6 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
 
                 err = clEnqueueReadBuffer( queue, buffers[ii+1], true, 0, ptrSizes[i]*num_elements, outptr[i], 0, NULL, NULL );
                 if ( err != CL_SUCCESS ){
-                    clReleaseMemObject( buffers[ii] );
-                    clReleaseMemObject( buffers[ii+1] );
-                    clReleaseKernel( kernel[i] );
-                    clReleaseProgram( program[i] );
                     align_free( outptr[i] );
                     print_error( err, " clEnqueueReadBuffer failed" );
                     free_mtdata(d);
@@ -938,17 +905,19 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
                 }
 
                 if ( verify_write_struct( inptr[i], outptr[i], (int)(ptrSizes[i] * (size_t)num_elements / ptrSizes[0]) ) ){
-                    log_error( " buffer_WRITE struct%d test failed\n", 1<<i );
+                    log_error(" buffer_WRITE struct%d test failed. "
+                              "cl_mem_flags src: %s dst: %s\n",
+                              1 << i, flag_set_names[src_flag_id],
+                              flag_set_names[dst_flag_id]);
                     total_errors++;
                 }
                 else{
-                    log_info( " buffer_WRITE struct%d test passed\n", 1<<i );
+                    log_info(" buffer_WRITE struct%d test passed. cl_mem_flags "
+                             "src: %s dst: %s\n",
+                             1 << i, flag_set_names[src_flag_id],
+                             flag_set_names[dst_flag_id]);
                 }
                 // cleanup
-                clReleaseMemObject( buffers[ii] );
-                clReleaseMemObject( buffers[ii+1] );
-                clReleaseKernel( kernel[i] );
-                clReleaseProgram( program[i] );
                 align_free( outptr[i] );
                 align_free( (void *)inptr[i] );
             }

From 3615c4eea6d6057f7b5cdd0f79ab474f6c528c80 Mon Sep 17 00:00:00 2001
From: Jeremy Kemp <jeremy@jeremykemp.co.uk>
Date: Wed, 23 Dec 2020 22:05:38 +0000
Subject: [PATCH 002/158] Use memcmp for select verification. (#1084)

* Use memcmp for select verification.

If memcmp fails, fall back to looping through the result buffer to find the incorrect result.

* Removed extra prints for check_int.
---
 test_conformance/select/util_select.cpp | 205 ++++++++++++++----------
 1 file changed, 122 insertions(+), 83 deletions(-)

diff --git a/test_conformance/select/util_select.cpp b/test_conformance/select/util_select.cpp
index 71c58bc2fb..f9641e9938 100644
--- a/test_conformance/select/util_select.cpp
+++ b/test_conformance/select/util_select.cpp
@@ -561,13 +561,18 @@ size_t check_uchar(void *test, void *correct, size_t count, size_t vector_size)
     const cl_uchar *c = (const cl_uchar *) correct;
     size_t i;
 
-    for(i = 0; i < count; i++)
-        if (t[i] != c[i]) {
-            log_error("\n(check_uchar) Error for vector size %ld found at 0x%8.8lx (of 0x%8.8lx):  "
-                      "*0x%2.2x vs 0x%2.2x\n", vector_size, i, count, c[i], t[i]);
-            return i + 1;
-        }
-
+    if (memcmp(t, c, count * sizeof(c[0])) != 0)
+    {
+        for (i = 0; i < count; i++)
+            if (t[i] != c[i])
+            {
+                log_error("\n(check_uchar) Error for vector size %ld found at "
+                          "0x%8.8lx (of 0x%8.8lx):  "
+                          "*0x%2.2x vs 0x%2.2x\n",
+                          vector_size, i, count, c[i], t[i]);
+                return i + 1;
+            }
+    }
     return 0;
 }
 
@@ -576,13 +581,18 @@ size_t check_char(void *test, void *correct, size_t count, size_t vector_size) {
     const cl_char *c = (const cl_char *) correct;
     size_t i;
 
-
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] ) {
-            log_error("\n(check_char) Error for vector size %ld found at 0x%8.8lx (of 0x%8.8lx):  "
-                      "*0x%2.2x vs 0x%2.2x\n", vector_size, i, count, c[i], t[i] );
-            return i + 1;
-        }
+    if (memcmp(t, c, count * sizeof(c[0])) != 0)
+    {
+        for (i = 0; i < count; i++)
+            if (t[i] != c[i])
+            {
+                log_error("\n(check_char) Error for vector size %ld found at "
+                          "0x%8.8lx (of 0x%8.8lx):  "
+                          "*0x%2.2x vs 0x%2.2x\n",
+                          vector_size, i, count, c[i], t[i]);
+                return i + 1;
+            }
+    }
 
     return 0;
 }
@@ -592,13 +602,18 @@ size_t check_ushort(void *test, void *correct, size_t count, size_t vector_size)
     const cl_ushort *c = (const cl_ushort *) correct;
     size_t i;
 
-
-    for( i = 0; i < count; i++ )
-        if(t[i] != c[i]) {
-            log_error("\n(check_ushort) Error for vector size %ld found at 0x%8.8lx (of 0x%8.8lx):  "
-                      "*0x%4.4x vs 0x%4.4x\n", vector_size, i, count, c[i], t[i] );
-            return i + 1;
-        }
+    if (memcmp(t, c, count * sizeof(c[0])) != 0)
+    {
+        for (i = 0; i < count; i++)
+            if (t[i] != c[i])
+            {
+                log_error("\n(check_ushort) Error for vector size %ld found at "
+                          "0x%8.8lx (of 0x%8.8lx):  "
+                          "*0x%4.4x vs 0x%4.4x\n",
+                          vector_size, i, count, c[i], t[i]);
+                return i + 1;
+            }
+    }
 
     return 0;
 }
@@ -608,13 +623,18 @@ size_t check_short(void *test, void *correct, size_t count, size_t vector_size)
     const cl_short *c = (const cl_short *) correct;
     size_t i;
 
-
-    for (i = 0; i < count; i++)
-        if(t[i] != c[i]) {
-            log_error("\n(check_short) Error for vector size %ld found at 0x%8.8lx (of 0x%8.8lx):  "
-                      "*0x%8.8x vs 0x%8.8x\n", vector_size, i, count, c[i], t[i] );
-            return i + 1;
-        }
+    if (memcmp(t, c, count * sizeof(c[0])) != 0)
+    {
+        for (i = 0; i < count; i++)
+            if (t[i] != c[i])
+            {
+                log_error("\n(check_short) Error for vector size %ld found at "
+                          "0x%8.8lx (of 0x%8.8lx):  "
+                          "*0x%8.8x vs 0x%8.8x\n",
+                          vector_size, i, count, c[i], t[i]);
+                return i + 1;
+            }
+    }
 
     return 0;
 }
@@ -624,14 +644,18 @@ size_t check_uint(void *test, void *correct, size_t count, size_t vector_size) {
     const cl_uint *c = (const cl_uint *) correct;
     size_t i;
 
-
-
-    for (i = 0; i < count; i++)
-        if(t[i] != c[i]) {
-            log_error("\n(check_uint) Error for vector size %ld found at 0x%8.8lx (of 0x%8.8lx):  "
-                      "*0x%8.8x vs 0x%8.8x\n", vector_size, i, count, c[i], t[i] );
-            return i + 1;
-        }
+    if (memcmp(t, c, count * sizeof(c[0])) != 0)
+    {
+        for (i = 0; i < count; i++)
+            if (t[i] != c[i])
+            {
+                log_error("\n(check_uint) Error for vector size %ld found at "
+                          "0x%8.8lx (of 0x%8.8lx):  "
+                          "*0x%8.8x vs 0x%8.8x\n",
+                          vector_size, i, count, c[i], t[i]);
+                return i + 1;
+            }
+    }
 
     return 0;
 }
@@ -641,24 +665,19 @@ size_t check_int(void *test, void *correct, size_t count, size_t vector_size) {
     const cl_int *c = (const cl_int *) correct;
     size_t i;
 
+    if (memcmp(t, c, count * sizeof(c[0])) != 0)
+    {
+        for (i = 0; i < count; i++)
+            if (t[i] != c[i])
+            {
 
-    for(i = 0; i < count; i++)
-        if( t[i] != c[i] ) {
-
-            log_error("\n(check_int) Error for vector size %ld found at 0x%8.8lx (of 0x%8.8lx):  "
-                      "*0x%8.8x vs 0x%8.8x\n", vector_size, i, count, c[i], t[i]);
-            log_error("\n(check_int) Error for vector size %ld found at 0x%8.8lx (of 0x%8.8lx):  "
-                      "*0x%8.8x vs 0x%8.8x\n", vector_size, i+1, count,c[i+1], t[i+1]);
-            log_error("\n(check_int) Error for vector size %ld found at 0x%8.8lx (of 0x%8.8lx):  "
-                      "*0x%8.8x vs 0x%8.8x\n", vector_size, i+2, count,c[i+2], t[i+2]);
-            log_error("\n(check_int) Error for vector size %ld found at 0x%8.8lx (of 0x%8.8lx):  "
-                      "*0x%8.8x vs 0x%8.8x\n", vector_size, i+3, count,c[i+3], t[i+3]);
-            if(i) {
-                log_error("\n(check_int) Error for vector size %ld found just after 0x%8.8lx:  "
-                          "*0x%8.8x vs 0x%8.8x\n", vector_size, i-1, c[i-1], t[i-1]);
+                log_error("\n(check_int) Error for vector size %ld found at "
+                          "0x%8.8lx (of 0x%8.8lx):  "
+                          "*0x%8.8x vs 0x%8.8x\n",
+                          vector_size, i, count, c[i], t[i]);
+                return i + 1;
             }
-            return i + 1;
-        }
+    }
 
     return 0;
 }
@@ -668,13 +687,18 @@ size_t check_ulong(void *test, void *correct, size_t count, size_t vector_size)
     const cl_ulong *c = (const cl_ulong *) correct;
     size_t i;
 
-
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] ) {
-            log_error("\n(check_ulong) Error for vector size %ld found at 0x%8.8lx (of 0x%8.8lx):  "
-                      "*0x%16.16llx vs 0x%16.16llx\n", vector_size, i, count, c[i], t[i] );
-            return i + 1;
-        }
+    if (memcmp(t, c, count * sizeof(c[0])) != 0)
+    {
+        for (i = 0; i < count; i++)
+            if (t[i] != c[i])
+            {
+                log_error("\n(check_ulong) Error for vector size %ld found at "
+                          "0x%8.8lx (of 0x%8.8lx):  "
+                          "*0x%16.16llx vs 0x%16.16llx\n",
+                          vector_size, i, count, c[i], t[i]);
+                return i + 1;
+            }
+    }
 
     return 0;
 }
@@ -684,13 +708,18 @@ size_t check_long(void *test, void *correct, size_t count, size_t vector_size) {
     const cl_long *c = (const cl_long *) correct;
     size_t i;
 
-
-    for(i = 0; i < count; i++ )
-        if(t[i] != c[i]) {
-            log_error("\n(check_long) Error for vector size %ld found at 0x%8.8lx (of 0x%8.8lx):  "
-                      "*0x%16.16llx vs 0x%16.16llx\n", vector_size, i, count, c[i], t[i] );
-            return i + 1;
-        }
+    if (memcmp(t, c, count * sizeof(c[0])) != 0)
+    {
+        for (i = 0; i < count; i++)
+            if (t[i] != c[i])
+            {
+                log_error("\n(check_long) Error for vector size %ld found at "
+                          "0x%8.8lx (of 0x%8.8lx):  "
+                          "*0x%16.16llx vs 0x%16.16llx\n",
+                          vector_size, i, count, c[i], t[i]);
+                return i + 1;
+            }
+    }
 
     return 0;
 }
@@ -700,14 +729,19 @@ size_t check_float( void *test, void *correct, size_t count, size_t vector_size
     const cl_uint *c = (const cl_uint *) correct;
     size_t i;
 
-
-    for( i = 0; i < count; i++ )
-        /* Allow nans to be binary different */
-        if ((t[i] != c[i]) && !(isnan(((float *)correct)[i]) && isnan(((float *)test)[i]))) {
-            log_error("\n(check_float) Error for vector size %ld found at 0x%8.8lx (of 0x%8.8lx):  "
-                      "*0x%8.8x vs 0x%8.8x\n", vector_size, i, count, c[i], t[i] );
-            return i + 1;
-        }
+    if (memcmp(t, c, count * sizeof(c[0])) != 0)
+    {
+        for (i = 0; i < count; i++) /* Allow nans to be binary different */
+            if ((t[i] != c[i])
+                && !(isnan(((float *)correct)[i]) && isnan(((float *)test)[i])))
+            {
+                log_error("\n(check_float) Error for vector size %ld found at "
+                          "0x%8.8lx (of 0x%8.8lx):  "
+                          "*0x%8.8x vs 0x%8.8x\n",
+                          vector_size, i, count, c[i], t[i]);
+                return i + 1;
+            }
+    }
 
     return 0;
 }
@@ -717,15 +751,20 @@ size_t check_double( void *test, void *correct, size_t count, size_t vector_size
     const cl_ulong *c = (const cl_ulong *) correct;
     size_t i;
 
-
-
-    for( i = 0; i < count; i++ )
-        /* Allow nans to be binary different */
-        if ((t[i] != c[i]) && !(isnan(((double *)correct)[i]) && isnan(((double *)test)[i]))) {
-            log_error("\n(check_double) Error for vector size %ld found at 0x%8.8lx (of 0x%8.8lx):  "
-                      "*0x%16.16llx vs 0x%16.16llx\n", vector_size, i, count, c[i], t[i] );
-            return i + 1;
-        }
+    if (memcmp(t, c, count * sizeof(c[0])) != 0)
+    {
+        for (i = 0; i < count; i++) /* Allow nans to be binary different */
+            if ((t[i] != c[i])
+                && !(isnan(((double *)correct)[i])
+                     && isnan(((double *)test)[i])))
+            {
+                log_error("\n(check_double) Error for vector size %ld found at "
+                          "0x%8.8lx (of 0x%8.8lx):  "
+                          "*0x%16.16llx vs 0x%16.16llx\n",
+                          vector_size, i, count, c[i], t[i]);
+                return i + 1;
+            }
+    }
 
     return 0;
 }

From 72998af43c4fcc256509eeb42fd60afeb1f4aea1 Mon Sep 17 00:00:00 2001
From: ellnor01 <51320439+ellnor01@users.noreply.github.com>
Date: Mon, 4 Jan 2021 15:00:45 +0000
Subject: [PATCH 003/158] Making object queries tests exhaustive (#1008)

* Making object queries tests exhaustive

Tests which were only testing fixed values have been made exhaustive
and refactored. They only use properties from the core spec.

Associated Macros have been turned into templated functions.

Comparison of devices with cl_device_id rather than vendor ids.

All object queries tests are now in test_queries.cpp.

Fixes #508

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>

* Remove unnecessary arguments from test_queries.cpp functions

Fixes #508

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>

* Using test_assert_error in test_queries.cpp

This commit also fixes the queue properties array and
corrects for on device queues being optional in CL3.0

test_queries_compatibility.cpp has been removed as
tests are now in test_queries.cpp

Contributes #508

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>

* Correct api test boolean for device queue support

Contributes #508

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>
---
 test_conformance/api/CMakeLists.txt           |   1 -
 test_conformance/api/test_queries.cpp         | 361 ++++++++++++------
 .../api/test_queries_compatibility.cpp        | 169 --------
 3 files changed, 245 insertions(+), 286 deletions(-)
 delete mode 100644 test_conformance/api/test_queries_compatibility.cpp

diff --git a/test_conformance/api/CMakeLists.txt b/test_conformance/api/CMakeLists.txt
index eedf6b490d..66efcc7b57 100644
--- a/test_conformance/api/CMakeLists.txt
+++ b/test_conformance/api/CMakeLists.txt
@@ -7,7 +7,6 @@ set(${MODULE_NAME}_SOURCES
          test_retain.cpp
          test_retain_program.cpp
          test_queries.cpp
-         test_queries_compatibility.cpp
          test_create_kernels.cpp
          test_kernels.cpp
          test_kernel_private_memory_size.cpp
diff --git a/test_conformance/api/test_queries.cpp b/test_conformance/api/test_queries.cpp
index ed01e89497..91678a20df 100644
--- a/test_conformance/api/test_queries.cpp
+++ b/test_conformance/api/test_queries.cpp
@@ -131,86 +131,91 @@ int test_get_platform_info(cl_device_id deviceID, cl_context context, cl_command
     return 0;
 }
 
-int test_get_sampler_info(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+template <typename T>
+int sampler_param_test(cl_sampler sampler, cl_sampler_info param_name,
+                       T expected, const char *name)
 {
-    int error;
     size_t size;
-
-    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID )
-
-    cl_sampler_properties properties[] = {
-        CL_SAMPLER_NORMALIZED_COORDS, CL_TRUE,
-        CL_SAMPLER_ADDRESSING_MODE, CL_ADDRESS_CLAMP,
-        CL_SAMPLER_FILTER_MODE, CL_FILTER_LINEAR,
-        0 };
-    clSamplerWrapper sampler = clCreateSamplerWithProperties(context, properties, &error);
-    test_error( error, "Unable to create sampler to test with" );
-
-    cl_uint refCount;
-    error = clGetSamplerInfo( sampler, CL_SAMPLER_REFERENCE_COUNT, sizeof( refCount ), &refCount, &size );
-    test_error( error, "Unable to get sampler ref count" );
-    if( size != sizeof( refCount ) )
+    T val;
+    int error = clGetSamplerInfo(sampler, param_name, sizeof(val), &val, &size);
+    test_error(error, "Unable to get sampler info");
+    if (val != expected)
     {
-        log_error( "ERROR: Returned size of sampler refcount does not validate! (expected %d, got %d)\n", (int)sizeof( refCount ), (int)size );
-        return -1;
+        test_fail("ERROR: Sampler %s did not validate!\n", name);
     }
-
-    cl_context otherCtx;
-    error = clGetSamplerInfo( sampler, CL_SAMPLER_CONTEXT, sizeof( otherCtx ), &otherCtx, &size );
-    test_error( error, "Unable to get sampler context" );
-    if( otherCtx != context )
-    {
-        log_error( "ERROR: Sampler context does not validate! (expected %p, got %p)\n", context, otherCtx );
-        return -1;
-    }
-    if( size != sizeof( otherCtx ) )
+    if (size != sizeof(val))
     {
-        log_error( "ERROR: Returned size of sampler context does not validate! (expected %d, got %d)\n", (int)sizeof( otherCtx ), (int)size );
-        return -1;
+        test_fail("ERROR: Returned size of sampler %s does not validate! "
+                  "(expected %d, got %d)\n",
+                  name, (int)sizeof(val), (int)size);
     }
+    return 0;
+}
 
-    cl_addressing_mode mode;
-    error = clGetSamplerInfo( sampler, CL_SAMPLER_ADDRESSING_MODE, sizeof( mode ), &mode, &size );
-    test_error( error, "Unable to get sampler addressing mode" );
-    if( mode != CL_ADDRESS_CLAMP )
-    {
-        log_error( "ERROR: Sampler addressing mode does not validate! (expected %d, got %d)\n", (int)CL_ADDRESS_CLAMP, (int)mode );
-        return -1;
-    }
-    if( size != sizeof( mode ) )
-    {
-        log_error( "ERROR: Returned size of sampler addressing mode does not validate! (expected %d, got %d)\n", (int)sizeof( mode ), (int)size );
-        return -1;
-    }
+static cl_int normalized_coord_values[] = { CL_TRUE, CL_FALSE };
+static cl_addressing_mode addressing_mode_values[] = {
+    CL_ADDRESS_NONE, CL_ADDRESS_CLAMP_TO_EDGE, CL_ADDRESS_CLAMP,
+    CL_ADDRESS_REPEAT, CL_ADDRESS_MIRRORED_REPEAT
+};
+static cl_filter_mode filter_mode_values[] = { CL_FILTER_NEAREST,
+                                               CL_FILTER_LINEAR };
 
-    cl_filter_mode fmode;
-    error = clGetSamplerInfo( sampler, CL_SAMPLER_FILTER_MODE, sizeof( fmode ), &fmode, &size );
-    test_error( error, "Unable to get sampler filter mode" );
-    if( fmode != CL_FILTER_LINEAR )
-    {
-        log_error( "ERROR: Sampler filter mode does not validate! (expected %d, got %d)\n", (int)CL_FILTER_LINEAR, (int)fmode );
-        return -1;
-    }
-    if( size != sizeof( fmode ) )
-    {
-        log_error( "ERROR: Returned size of sampler filter mode does not validate! (expected %d, got %d)\n", (int)sizeof( fmode ), (int)size );
-        return -1;
-    }
+int test_sampler_params(cl_device_id deviceID, cl_context context,
+                        bool is_compatibility, int norm_coord_num,
+                        int addr_mod_num, int filt_mod_num)
+{
+    cl_uint refCount;
+    size_t size;
+    int error;
 
-    cl_int norm;
-    error = clGetSamplerInfo( sampler, CL_SAMPLER_NORMALIZED_COORDS, sizeof( norm ), &norm, &size );
-    test_error( error, "Unable to get sampler normalized flag" );
-    if( norm != CL_TRUE )
+    clSamplerWrapper sampler;
+    cl_sampler_properties properties[] = {
+        CL_SAMPLER_NORMALIZED_COORDS,
+        normalized_coord_values[norm_coord_num],
+        CL_SAMPLER_ADDRESSING_MODE,
+        addressing_mode_values[addr_mod_num],
+        CL_SAMPLER_FILTER_MODE,
+        filter_mode_values[filt_mod_num],
+        0
+    };
+
+    if (is_compatibility)
     {
-        log_error( "ERROR: Sampler normalized flag does not validate! (expected %d, got %d)\n", (int)CL_TRUE, (int)norm );
-        return -1;
+        sampler =
+            clCreateSampler(context, normalized_coord_values[norm_coord_num],
+                            addressing_mode_values[addr_mod_num],
+                            filter_mode_values[filt_mod_num], &error);
+        test_error(error, "Unable to create sampler to test with");
     }
-    if( size != sizeof( norm ) )
+    else
     {
-        log_error( "ERROR: Returned size of sampler normalized flag does not validate! (expected %d, got %d)\n", (int)sizeof( norm ), (int)size );
-        return -1;
+        sampler = clCreateSamplerWithProperties(context, properties, &error);
+        test_error(error, "Unable to create sampler to test with");
     }
 
+    error = clGetSamplerInfo(sampler, CL_SAMPLER_REFERENCE_COUNT,
+                             sizeof(refCount), &refCount, &size);
+    test_error(error, "Unable to get sampler ref count");
+    test_assert_error(size == sizeof(refCount),
+                      "Returned size of sampler refcount does not validate!\n");
+
+    error = sampler_param_test(sampler, CL_SAMPLER_CONTEXT, context, "context");
+    test_error(error, "param checking failed");
+
+    error = sampler_param_test(sampler, CL_SAMPLER_ADDRESSING_MODE,
+                               addressing_mode_values[addr_mod_num],
+                               "addressing mode");
+    test_error(error, "param checking failed");
+
+    error = sampler_param_test(sampler, CL_SAMPLER_FILTER_MODE,
+                               filter_mode_values[filt_mod_num], "filter mode");
+    test_error(error, "param checking failed");
+
+    error = sampler_param_test(sampler, CL_SAMPLER_NORMALIZED_COORDS,
+                               normalized_coord_values[norm_coord_num],
+                               "normalized coords");
+    test_error(error, "param checking failed");
+
     Version version = get_device_cl_version(deviceID);
     if (version >= Version(3, 0))
     {
@@ -244,79 +249,203 @@ int test_get_sampler_info(cl_device_id deviceID, cl_context context, cl_command_
         error = compareProperties(check_properties, test_properties);
         test_error(error, "checkProperties mismatch.");
     }
+    return 0;
+}
 
+int get_sampler_info_params(cl_device_id deviceID, cl_context context,
+                            bool is_compatibility)
+{
+    for (int norm_coord_num = 0;
+         norm_coord_num < ARRAY_SIZE(normalized_coord_values); norm_coord_num++)
+    {
+        for (int addr_mod_num = 0;
+             addr_mod_num < ARRAY_SIZE(addressing_mode_values); addr_mod_num++)
+        {
+            if ((normalized_coord_values[norm_coord_num] == CL_FALSE)
+                && ((addressing_mode_values[addr_mod_num] == CL_ADDRESS_REPEAT)
+                    || (addressing_mode_values[addr_mod_num]
+                        == CL_ADDRESS_MIRRORED_REPEAT)))
+            {
+                continue;
+            }
+            for (int filt_mod_num = 0;
+                 filt_mod_num < ARRAY_SIZE(filter_mode_values); filt_mod_num++)
+            {
+                int err = test_sampler_params(deviceID, context,
+                                              is_compatibility, norm_coord_num,
+                                              addr_mod_num, filt_mod_num);
+                test_error(err, "testing clGetSamplerInfo params failed");
+            }
+        }
+    }
     return 0;
 }
+int test_get_sampler_info(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
+{
+    int error;
+    PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID)
 
-#define TEST_COMMAND_QUEUE_PARAM( queue, paramName, val, expected, name, type, cast )    \
-error = clGetCommandQueueInfo( queue, paramName, sizeof( val ), &val, &size );        \
-test_error( error, "Unable to get command queue " name );                            \
-if( val != expected )                                                                \
-{                                                                                    \
-log_error( "ERROR: Command queue " name " did not validate! (expected " type ", got " type ")\n", (cast)expected, (cast)val );    \
-return -1;                                                                        \
-}            \
-if( size != sizeof( val ) )                \
-{                                        \
-log_error( "ERROR: Returned size of command queue " name " does not validate! (expected %d, got %d)\n", (int)sizeof( val ), (int)size );    \
-return -1;    \
+    error = get_sampler_info_params(deviceID, context, false);
+    test_error(error, "Test Failed");
+
+    return 0;
 }
 
-int test_get_command_queue_info(cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements)
+int test_get_sampler_info_compatibility(cl_device_id deviceID,
+                                        cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements)
 {
     int error;
+    PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID)
+
+    error = get_sampler_info_params(deviceID, context, true);
+    test_error(error, "Test Failed");
+
+    return 0;
+}
+
+template <typename T>
+int command_queue_param_test(cl_command_queue queue,
+                             cl_command_queue_info param_name, T expected,
+                             const char *name)
+{
     size_t size;
+    T val;
+    int error =
+        clGetCommandQueueInfo(queue, param_name, sizeof(val), &val, &size);
+    test_error(error, "Unable to get command queue info");
+    if (val != expected)
+    {
+        test_fail("ERROR: Command queue %s did not validate!\n", name);
+    }
+    if (size != sizeof(val))
+    {
+        test_fail("ERROR: Returned size of command queue %s does not validate! "
+                  "(expected %d, got %d)\n",
+                  name, (int)sizeof(val), (int)size);
+    }
+    return 0;
+}
 
-    cl_queue_properties device_props;
-    cl_queue_properties queue_props[] = {CL_QUEUE_PROPERTIES,0,0};
+#define MIN_NUM_COMMAND_QUEUE_PROPERTIES 2
+#define OOO_NUM_COMMAND_QUEUE_PROPERTIES 4
+static cl_command_queue_properties property_options[] = {
+    0,
 
-    clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, sizeof(device_props), &device_props, NULL);
-    log_info("CL_DEVICE_QUEUE_ON_HOST_PROPERTIES is %d\n", (int)device_props);
+    CL_QUEUE_PROFILING_ENABLE,
 
-    // Mask off vendor extension properties.  Only test standard OpenCL properties
-    device_props &= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|CL_QUEUE_PROFILING_ENABLE;
+    CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
 
-    queue_props[1] = device_props;
-    clCommandQueueWrapper queue = clCreateCommandQueueWithProperties( context, deviceID, &queue_props[0], &error );
-    test_error( error, "Unable to create command queue to test with" );
+    CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
 
-    cl_uint refCount;
-    error = clGetCommandQueueInfo( queue, CL_QUEUE_REFERENCE_COUNT, sizeof( refCount ), &refCount, &size );
-    test_error( error, "Unable to get command queue reference count" );
-    if( size != sizeof( refCount ) )
-    {
-        log_error( "ERROR: Returned size of command queue reference count does not validate! (expected %d, got %d)\n", (int)sizeof( refCount ), (int)size );
-        return -1;
-    }
+    CL_QUEUE_ON_DEVICE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
 
-    cl_context otherCtx;
-    TEST_COMMAND_QUEUE_PARAM( queue, CL_QUEUE_CONTEXT, otherCtx, context, "context", "%p", cl_context )
+    CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_ON_DEVICE
+        | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
 
-    cl_device_id otherDevice;
-    error = clGetCommandQueueInfo( queue, CL_QUEUE_DEVICE, sizeof(otherDevice), &otherDevice, &size);
-    test_error(error, "clGetCommandQueue failed.");
+    CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT
+        | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
 
-    if (size != sizeof(cl_device_id)) {
-        log_error( " ERROR: Returned size of command queue CL_QUEUE_DEVICE does not validate! (expected %d, got %d)\n", (int)sizeof( otherDevice ), (int)size );
-        return -1;
-    }
+    CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT
+        | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
+};
+
+// Creates a queue per applicable property set and validates its info queries.
+int check_get_command_queue_info_params(
+    cl_device_id deviceID, cl_context context, bool is_compatibility)
+{
+    int error;
+    size_t size;
+    // Zeroed so that a failed or unsupported query reads as "no properties".
+    cl_queue_properties host_queue_props = 0, device_queue_props = 0;
+    cl_queue_properties queue_props[] = { CL_QUEUE_PROPERTIES, 0, 0 };
+
+    clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES,
+                    sizeof(host_queue_props), &host_queue_props, NULL);
+    log_info("CL_DEVICE_QUEUE_ON_HOST_PROPERTIES is %d\n",
+             (int)host_queue_props);
+    clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES,
+                    sizeof(device_queue_props), &device_queue_props, NULL);
+    log_info("CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES is %d\n",
+             (int)device_queue_props);
 
-    /* Since the device IDs are opaque types we check the CL_DEVICE_VENDOR_ID which is unique for identical hardware. */
-    cl_uint otherDevice_vid, deviceID_vid;
-    error = clGetDeviceInfo(otherDevice, CL_DEVICE_VENDOR_ID, sizeof(otherDevice_vid), &otherDevice_vid, NULL );
-    test_error( error, "Unable to get device CL_DEVICE_VENDOR_ID" );
-    error = clGetDeviceInfo(deviceID, CL_DEVICE_VENDOR_ID, sizeof(deviceID_vid), &deviceID_vid, NULL );
-    test_error( error, "Unable to get device CL_DEVICE_VENDOR_ID" );
+    auto version = get_device_cl_version(deviceID);
 
-    if( otherDevice_vid != deviceID_vid )
+    // On-device queues: assumed for 2.x, reported by the device for 3.0+
+    bool on_device_supported =
+        (version >= Version(2, 0) && version < Version(3, 0))
+        || (version >= Version(3, 0) && device_queue_props != 0);
+
+    int num_test_options = MIN_NUM_COMMAND_QUEUE_PROPERTIES;
+    if (host_queue_props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
     {
-        log_error( "ERROR: Incorrect device returned for queue! (Expected vendor ID 0x%x, got 0x%x)\n", deviceID_vid, otherDevice_vid );
-        return -1;
+        // Also test the out-of-order property combinations when supported
+        num_test_options = OOO_NUM_COMMAND_QUEUE_PROPERTIES;
+    }
+    if (on_device_supported && !is_compatibility)
+    {
+        // Test queue on device if supported (in this case out-of-order must
+        // also be supported)
+        num_test_options = ARRAY_SIZE(property_options);
     }
 
-    cl_command_queue_properties props;
-    TEST_COMMAND_QUEUE_PARAM( queue, CL_QUEUE_PROPERTIES, props, (unsigned int)( device_props ), "properties", "%d", unsigned int )
+    for (int i = 0; i < num_test_options; i++)
+    {
+        queue_props[1] = property_options[i];
+        clCommandQueueWrapper queue;
 
+        if (is_compatibility)
+        {
+            queue =
+                clCreateCommandQueue(context, deviceID, queue_props[1], &error);
+            test_error(error, "Unable to create command queue to test with");
+        }
+        else
+        {
+            queue = clCreateCommandQueueWithProperties(context, deviceID,
+                                                       &queue_props[0], &error);
+            test_error(error, "Unable to create command queue to test with");
+        }
+
+        cl_uint refCount;
+        error = clGetCommandQueueInfo(queue, CL_QUEUE_REFERENCE_COUNT,
+                                      sizeof(refCount), &refCount, &size);
+        test_error(error, "Unable to get command queue reference count");
+        test_assert_error(size == sizeof(refCount),
+                          "Returned size of command queue reference count does "
+                          "not validate!\n");
+
+        error = command_queue_param_test(queue, CL_QUEUE_CONTEXT, context,
+                                         "context");
+        test_error(error, "param checking failed");
+
+        error = command_queue_param_test(queue, CL_QUEUE_DEVICE, deviceID,
+                                         "deviceID");
+        test_error(error, "param checking failed");
+
+        error = command_queue_param_test(queue, CL_QUEUE_PROPERTIES,
+                                         queue_props[1], "properties");
+        test_error(error, "param checking failed");
+    }
+    return 0;
+}
+int test_get_command_queue_info(cl_device_id deviceID, cl_context context,
+                                cl_command_queue ignoreQueue, int num_elements)
+{
+    int error = check_get_command_queue_info_params(deviceID, context, false);
+    test_error(error, "Test Failed");
+    return 0;
+}
+
+int test_get_command_queue_info_compatibility(cl_device_id deviceID,
+                                              cl_context context,
+                                              cl_command_queue ignoreQueue,
+                                              int num_elements)
+{
+    int error = check_get_command_queue_info_params(deviceID, context, true);
+    test_error(error, "Test Failed");
     return 0;
 }
 
diff --git a/test_conformance/api/test_queries_compatibility.cpp b/test_conformance/api/test_queries_compatibility.cpp
deleted file mode 100644
index c53fba894d..0000000000
--- a/test_conformance/api/test_queries_compatibility.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "testBase.h"
-#include "harness/imageHelpers.h"
-#include <stdlib.h>
-#include <ctype.h>
-
-int test_get_sampler_info_compatibility(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-{
-    int error;
-    size_t size;
-
-    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID )
-
-    clSamplerWrapper sampler = clCreateSampler( context, CL_TRUE, CL_ADDRESS_CLAMP, CL_FILTER_LINEAR, &error );
-    test_error( error, "Unable to create sampler to test with" );
-
-    cl_uint refCount;
-    error = clGetSamplerInfo( sampler, CL_SAMPLER_REFERENCE_COUNT, sizeof( refCount ), &refCount, &size );
-    test_error( error, "Unable to get sampler ref count" );
-    if( size != sizeof( refCount ) )
-    {
-        log_error( "ERROR: Returned size of sampler refcount does not validate! (expected %d, got %d)\n", (int)sizeof( refCount ), (int)size );
-        return -1;
-    }
-
-    cl_context otherCtx;
-    error = clGetSamplerInfo( sampler, CL_SAMPLER_CONTEXT, sizeof( otherCtx ), &otherCtx, &size );
-    test_error( error, "Unable to get sampler context" );
-    if( otherCtx != context )
-    {
-        log_error( "ERROR: Sampler context does not validate! (expected %p, got %p)\n", context, otherCtx );
-        return -1;
-    }
-    if( size != sizeof( otherCtx ) )
-    {
-        log_error( "ERROR: Returned size of sampler context does not validate! (expected %d, got %d)\n", (int)sizeof( otherCtx ), (int)size );
-        return -1;
-    }
-
-    cl_addressing_mode mode;
-    error = clGetSamplerInfo( sampler, CL_SAMPLER_ADDRESSING_MODE, sizeof( mode ), &mode, &size );
-    test_error( error, "Unable to get sampler addressing mode" );
-    if( mode != CL_ADDRESS_CLAMP )
-    {
-        log_error( "ERROR: Sampler addressing mode does not validate! (expected %d, got %d)\n", (int)CL_ADDRESS_CLAMP, (int)mode );
-        return -1;
-    }
-    if( size != sizeof( mode ) )
-    {
-        log_error( "ERROR: Returned size of sampler addressing mode does not validate! (expected %d, got %d)\n", (int)sizeof( mode ), (int)size );
-        return -1;
-    }
-
-    cl_filter_mode fmode;
-    error = clGetSamplerInfo( sampler, CL_SAMPLER_FILTER_MODE, sizeof( fmode ), &fmode, &size );
-    test_error( error, "Unable to get sampler filter mode" );
-    if( fmode != CL_FILTER_LINEAR )
-    {
-        log_error( "ERROR: Sampler filter mode does not validate! (expected %d, got %d)\n", (int)CL_FILTER_LINEAR, (int)fmode );
-        return -1;
-    }
-    if( size != sizeof( fmode ) )
-    {
-        log_error( "ERROR: Returned size of sampler filter mode does not validate! (expected %d, got %d)\n", (int)sizeof( fmode ), (int)size );
-        return -1;
-    }
-
-    cl_int norm;
-    error = clGetSamplerInfo( sampler, CL_SAMPLER_NORMALIZED_COORDS, sizeof( norm ), &norm, &size );
-    test_error( error, "Unable to get sampler normalized flag" );
-    if( norm != CL_TRUE )
-    {
-        log_error( "ERROR: Sampler normalized flag does not validate! (expected %d, got %d)\n", (int)CL_TRUE, (int)norm );
-        return -1;
-    }
-    if( size != sizeof( norm ) )
-    {
-        log_error( "ERROR: Returned size of sampler normalized flag does not validate! (expected %d, got %d)\n", (int)sizeof( norm ), (int)size );
-        return -1;
-    }
-
-    return 0;
-}
-
-#define TEST_COMMAND_QUEUE_PARAM( queue, paramName, val, expected, name, type, cast )    \
-error = clGetCommandQueueInfo( queue, paramName, sizeof( val ), &val, &size );        \
-test_error( error, "Unable to get command queue " name );                            \
-if( val != expected )                                                                \
-{                                                                                    \
-log_error( "ERROR: Command queue " name " did not validate! (expected " type ", got " type ")\n", (cast)expected, (cast)val );    \
-return -1;                                                                        \
-}            \
-if( size != sizeof( val ) )                \
-{                                        \
-log_error( "ERROR: Returned size of command queue " name " does not validate! (expected %d, got %d)\n", (int)sizeof( val ), (int)size );    \
-return -1;    \
-}
-
-int test_get_command_queue_info_compatibility(cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements)
-{
-    int error;
-    size_t size;
-
-    cl_command_queue_properties device_props;
-    clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_PROPERTIES, sizeof(device_props), &device_props, NULL);
-    log_info("CL_DEVICE_QUEUE_PROPERTIES is %d\n", (int)device_props);
-
-    // Mask off vendor extension properties.  Only test standard OpenCL
-    // properties
-    device_props &=
-        CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE;
-
-    clCommandQueueWrapper queue = clCreateCommandQueue( context, deviceID, device_props, &error );
-    test_error( error, "Unable to create command queue to test with" );
-
-    cl_uint refCount;
-    error = clGetCommandQueueInfo( queue, CL_QUEUE_REFERENCE_COUNT, sizeof( refCount ), &refCount, &size );
-    test_error( error, "Unable to get command queue reference count" );
-    if( size != sizeof( refCount ) )
-    {
-        log_error( "ERROR: Returned size of command queue reference count does not validate! (expected %d, got %d)\n", (int)sizeof( refCount ), (int)size );
-        return -1;
-    }
-
-    cl_context otherCtx;
-    TEST_COMMAND_QUEUE_PARAM( queue, CL_QUEUE_CONTEXT, otherCtx, context, "context", "%p", cl_context )
-
-    cl_device_id otherDevice;
-    error = clGetCommandQueueInfo( queue, CL_QUEUE_DEVICE, sizeof(otherDevice), &otherDevice, &size);
-    test_error(error, "clGetCommandQueue failed.");
-
-    if (size != sizeof(cl_device_id)) {
-        log_error( " ERROR: Returned size of command queue CL_QUEUE_DEVICE does not validate! (expected %d, got %d)\n", (int)sizeof( otherDevice ), (int)size );
-        return -1;
-    }
-
-    /* Since the device IDs are opaque types we check the CL_DEVICE_VENDOR_ID which is unique for identical hardware. */
-    cl_uint otherDevice_vid, deviceID_vid;
-    error = clGetDeviceInfo(otherDevice, CL_DEVICE_VENDOR_ID, sizeof(otherDevice_vid), &otherDevice_vid, NULL );
-    test_error( error, "Unable to get device CL_DEVICE_VENDOR_ID" );
-    error = clGetDeviceInfo(deviceID, CL_DEVICE_VENDOR_ID, sizeof(deviceID_vid), &deviceID_vid, NULL );
-    test_error( error, "Unable to get device CL_DEVICE_VENDOR_ID" );
-
-    if( otherDevice_vid != deviceID_vid )
-    {
-        log_error( "ERROR: Incorrect device returned for queue! (Expected vendor ID 0x%x, got 0x%x)\n", deviceID_vid, otherDevice_vid );
-        return -1;
-    }
-
-    cl_command_queue_properties props;
-    TEST_COMMAND_QUEUE_PARAM( queue, CL_QUEUE_PROPERTIES, props, (unsigned int)( device_props ), "properties", "%d", unsigned int )
-
-    return 0;
-}
-

From 5f869e1c9836aea3af86f4490b6fc7c84ccf54e2 Mon Sep 17 00:00:00 2001
From: Chetan Mistry <70694498+chemis01@users.noreply.github.com>
Date: Mon, 4 Jan 2021 15:20:19 +0000
Subject: [PATCH 004/158] Move TEST_SKIPPED_ITSELF to test_status in
 testHarness (#1089)

TEST_SKIPPED_ITSELF was originally defined in
threadTesting.h, but this no longer makes sense.
This change moves the definition into the test_status
enum in testHarness.h so that it can be used in the same
way as the other test_status values.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>
---
 test_common/harness/testHarness.cpp | 2 ++
 test_common/harness/testHarness.h   | 1 +
 test_common/harness/threadTesting.h | 2 --
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/test_common/harness/testHarness.cpp b/test_common/harness/testHarness.cpp
index b2516331b7..48dd482932 100644
--- a/test_common/harness/testHarness.cpp
+++ b/test_common/harness/testHarness.cpp
@@ -480,6 +480,7 @@ int runTestHarnessWithCheck(int argc, const char *argv[], int testNum,
                 case TEST_PASS: break;
                 case TEST_FAIL: return fail_init_info(testNum);
                 case TEST_SKIP: return skip_init_info(testNum);
+                case TEST_SKIPPED_ITSELF: return skip_init_info(testNum);
             }
         }
     }
@@ -493,6 +494,7 @@ int runTestHarnessWithCheck(int argc, const char *argv[], int testNum,
             case TEST_PASS: break;
             case TEST_FAIL: return fail_init_info(testNum);
             case TEST_SKIP: return skip_init_info(testNum);
+            case TEST_SKIPPED_ITSELF: return skip_init_info(testNum);
         }
     }
 
diff --git a/test_common/harness/testHarness.h b/test_common/harness/testHarness.h
index d681616a55..331555b2b2 100644
--- a/test_common/harness/testHarness.h
+++ b/test_common/harness/testHarness.h
@@ -80,6 +80,7 @@ typedef enum test_status
     TEST_PASS = 0,
     TEST_FAIL = 1,
     TEST_SKIP = 2,
+    TEST_SKIPPED_ITSELF = -100,
 } test_status;
 
 extern int gFailCount;
diff --git a/test_common/harness/threadTesting.h b/test_common/harness/threadTesting.h
index 91ff279f65..765eabcc61 100644
--- a/test_common/harness/threadTesting.h
+++ b/test_common/harness/threadTesting.h
@@ -22,8 +22,6 @@
 #include <CL/opencl.h>
 #endif
 
-#define TEST_SKIPPED_ITSELF -100
-
 typedef int (*basefn)(cl_device_id deviceID, cl_context context,
                       cl_command_queue queue, int num_elements);
 extern int test_threaded_function(basefn fnToTest, cl_device_id device,

From 42d58be9a3b75fd11398d716f7e13a0fbc7499bd Mon Sep 17 00:00:00 2001
From: ellnor01 <51320439+ellnor01@users.noreply.github.com>
Date: Mon, 4 Jan 2021 15:20:55 +0000
Subject: [PATCH 005/158] Use clang-format-9 binary in Github Actions (#1088)

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>
---
 check-format.sh | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/check-format.sh b/check-format.sh
index 7eae2fdc15..7de2bd2c99 100755
--- a/check-format.sh
+++ b/check-format.sh
@@ -2,12 +2,10 @@
 
 # Arg used to specify non-'origin/master' comparison branch
 ORIGIN_BRANCH=${1:-"origin/master"}
+CLANG_BINARY=${2:-"`which clang-format-9`"}
 
 # Run git-clang-format to check for violations
-if [ "$TRAVIS" == "true" ]; then
-    EXTRA_OPTS="--binary `which clang-format-9`"
-fi
-CLANG_FORMAT_OUTPUT=$(git-clang-format --diff $ORIGIN_BRANCH --extensions c,cpp,h,hpp $EXTRA_OPTS)
+CLANG_FORMAT_OUTPUT=$(git-clang-format --diff $ORIGIN_BRANCH --extensions c,cpp,h,hpp --binary $CLANG_BINARY)
 
 # Check for no-ops
 grep '^no modified files to format$' <<<"$CLANG_FORMAT_OUTPUT" && exit 0

From 1cd9d084d5f426fd4746b73f23c95a8f8ea79c15 Mon Sep 17 00:00:00 2001
From: James Price <jrprice@google.com>
Date: Mon, 4 Jan 2021 11:12:44 -0500
Subject: [PATCH 006/158] Fix implicit int->float warning inside kernel (#1093)

* Fix implicit int->float warning inside kernel

This kernel is used to test various compiler options including
-Werror. Some compilers produce a warning about the implicit
conversion which results in this test failing when -Werror is used.

* Fix formatting
---
 test_conformance/compiler/test_build_options.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/test_conformance/compiler/test_build_options.cpp b/test_conformance/compiler/test_build_options.cpp
index 7ab4454f5f..5bd9411523 100644
--- a/test_conformance/compiler/test_build_options.cpp
+++ b/test_conformance/compiler/test_build_options.cpp
@@ -43,11 +43,12 @@ const char *include_test_kernel[] = {
 "}\n" };
 
 const char *options_test_kernel[] = {
-"__kernel void sample_test(__global float *src, __global int *dst)\n"
-"{\n"
-"    size_t tid = get_global_id(0);\n"
-"    dst[tid] = src[tid];\n"
-"}\n" };
+    "__kernel void sample_test(__global float *src, __global int *dst)\n"
+    "{\n"
+    "    size_t tid = get_global_id(0);\n"
+    "    dst[tid] = (int)src[tid];\n"
+    "}\n"
+};
 
 const char *optimization_options[] = {
     "-cl-single-precision-constant",

From 85bae70f811ff1828b9abca9575a45c3ff47b667 Mon Sep 17 00:00:00 2001
From: Chetan Mistry <70694498+chemis01@users.noreply.github.com>
Date: Mon, 4 Jan 2021 16:13:06 +0000
Subject: [PATCH 007/158] Fix Context Leak in api:clone_kernel (#1090)

In this test we repeatedly call create_single_kernel_helper
to create different kernels but reuse the same clProgramWrapper.
create_single_kernel_helper() creates a new program each time it
is called, creating a new reference on the underlying context.
When the test ends, the program is released (as it should be when
using the clProgramWrapper); however, multiple program objects
are still attached to the context, resulting in reference leak errors.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>
---
 test_conformance/api/test_clone_kernel.cpp | 27 ++++++++++++++++------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/test_conformance/api/test_clone_kernel.cpp b/test_conformance/api/test_clone_kernel.cpp
index 1f2278165a..1a7e67a678 100644
--- a/test_conformance/api/test_clone_kernel.cpp
+++ b/test_conformance/api/test_clone_kernel.cpp
@@ -113,15 +113,16 @@ int test_image_arg_shallow_clone(cl_device_id deviceID, cl_context context, cl_c
     clSamplerWrapper sampler;
     img_format.image_channel_order = CL_RGBA;
     img_format.image_channel_data_type = CL_UNSIGNED_INT8;
-	cl_image_desc imageDesc;
-	memset(&imageDesc, 0x0, sizeof(cl_image_desc));
+    cl_image_desc imageDesc;
+    memset(&imageDesc, 0x0, sizeof(cl_image_desc));
     imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
     imageDesc.image_width = 512;
     imageDesc.image_height = 512;
 
     cl_uint color[4] = {1,3,5,7};
 
-    clProgramWrapper program;
+    clProgramWrapper program_read;
+    clProgramWrapper program_write;
     clKernelWrapper kernel_read;
     clKernelWrapper kernel_write;
     clKernelWrapper kernel_cloned;
@@ -129,12 +130,16 @@ int test_image_arg_shallow_clone(cl_device_id deviceID, cl_context context, cl_c
 
     clMemWrapper img;
 
-    if( create_single_kernel_helper( context, &program, &kernel_read, 1, clone_kernel_test_img, "img_read_kernel" ) != 0 )
+    if (create_single_kernel_helper(context, &program_read, &kernel_read, 1,
+                                    clone_kernel_test_img, "img_read_kernel")
+        != 0)
     {
         return -1;
     }
 
-    if( create_single_kernel_helper( context, &program, &kernel_write, 1, clone_kernel_test_img, "img_write_kernel" ) != 0 )
+    if (create_single_kernel_helper(context, &program_write, &kernel_write, 1,
+                                    clone_kernel_test_img, "img_write_kernel")
+        != 0)
     {
         return -1;
     }
@@ -241,6 +246,8 @@ int test_clone_kernel(cl_device_id deviceID, cl_context context, cl_command_queu
 {
     int error;
     clProgramWrapper program;
+    clProgramWrapper program_buf_read;
+    clProgramWrapper program_buf_write;
     clKernelWrapper kernel;
     clKernelWrapper kernel_pipe_read;
     clKernelWrapper kernel_buf_read;
@@ -272,12 +279,18 @@ int test_clone_kernel(cl_device_id deviceID, cl_context context, cl_command_queu
         return -1;
     }
 
-    if( create_single_kernel_helper( context, &program, &kernel_buf_read, 1, clone_kernel_test_kernel, "buf_read_kernel" ) != 0 )
+    if (create_single_kernel_helper(context, &program_buf_read,
+                                    &kernel_buf_read, 1,
+                                    clone_kernel_test_kernel, "buf_read_kernel")
+        != 0)
     {
         return -1;
     }
 
-    if( create_single_kernel_helper( context, &program, &kernel_buf_write, 1, clone_kernel_test_kernel, "buf_write_kernel" ) != 0 )
+    if (create_single_kernel_helper(
+            context, &program_buf_write, &kernel_buf_write, 1,
+            clone_kernel_test_kernel, "buf_write_kernel")
+        != 0)
     {
         return -1;
     }

From 25d9ff5d6ec4dd8140b74f093b888984aa7580ee Mon Sep 17 00:00:00 2001
From: ellnor01 <51320439+ellnor01@users.noreply.github.com>
Date: Thu, 7 Jan 2021 11:34:42 +0000
Subject: [PATCH 008/158] Using helper functions for clCreateKernel (#1064)

* Using helper functions for clCreateKernel

Uses of clCreateKernel that follow the create-program helper
functions have been incorporated into
create_single_kernel_helper where suitable.

Contributes #31

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>

* Skip tests using clCompileProgram in offline mode

Contributes #31

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>

* Using type wrappers when using kernel helper functions

Also includes fix for windows build

Fixes #31

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>

* Remove clReleaseKernel for wrapped kernel

Fixes #31

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>
---
 test_common/harness/errorHelpers.cpp          |   2 +
 test_common/harness/imageHelpers.cpp          |  42 +----
 test_conformance/api/test_create_kernels.cpp  |   9 +-
 test_conformance/api/test_null_buffer_arg.cpp |  11 +-
 test_conformance/api/test_retain.cpp          |   8 +-
 test_conformance/api/test_retain_program.cpp  |   9 +-
 test_conformance/d3d10/harness.cpp            |  33 +---
 test_conformance/d3d11/harness.cpp            |  37 +----
 test_conformance/math_brute_force/main.cpp    | 148 ++++++++----------
 test_conformance/printf/test_printf.cpp       |  19 +--
 ...l_khr_spirv_no_integer_wrap_decoration.cpp |  10 +-
 test_conformance/spirv_new/test_op_fmath.cpp  |  10 +-
 .../spirv_new/test_op_vector_times_scalar.cpp |  10 +-
 13 files changed, 114 insertions(+), 234 deletions(-)

diff --git a/test_common/harness/errorHelpers.cpp b/test_common/harness/errorHelpers.cpp
index c1d0602897..8f3c188309 100644
--- a/test_common/harness/errorHelpers.cpp
+++ b/test_common/harness/errorHelpers.cpp
@@ -686,6 +686,8 @@ const char *subtests_to_skip_with_offline_compiler[] = {
     "unload_build_info",
     "unload_program_binaries",
     "features_macro",
+    "progvar_prog_scope_misc",
+    "library_function"
 };
 
 int check_functions_for_offline_compiler(const char *subtestname,
diff --git a/test_common/harness/imageHelpers.cpp b/test_common/harness/imageHelpers.cpp
index 26110a474e..b785f64ded 100644
--- a/test_common/harness/imageHelpers.cpp
+++ b/test_common/harness/imageHelpers.cpp
@@ -2917,7 +2917,7 @@ int DetectFloatToHalfRoundingMode(
         }
 
         // Create our program, and a kernel
-        const char *kernel[1] = {
+        const char *kernelSource[1] = {
             "kernel void detect_round( global float4 *in, write_only image2d_t "
             "out )\n"
             "{\n"
@@ -2927,8 +2927,9 @@ int DetectFloatToHalfRoundingMode(
         };
 
         clProgramWrapper program;
-        err = create_single_kernel_helper_create_program(context, &program, 1,
-                                                         kernel);
+        clKernelWrapper kernel;
+        err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                          kernelSource, "detect_round");
 
         if (NULL == program || err)
         {
@@ -2953,29 +2954,7 @@ int DetectFloatToHalfRoundingMode(
             return err;
         }
 
-        err = clBuildProgram(program, 1, &device, "", NULL, NULL);
-        if (err)
-        {
-            log_error("Error:  could not build program in "
-                      "DetectFloatToHalfRoundingMode  (%d)",
-                      err);
-            clReleaseMemObject(inBuf);
-            clReleaseMemObject(outImage);
-            return err;
-        }
-
-        cl_kernel k = clCreateKernel(program, "detect_round", &err);
-        if (NULL == k || err)
-        {
-            log_error("Error:  could not create kernel in "
-                      "DetectFloatToHalfRoundingMode  (%d)",
-                      err);
-            clReleaseMemObject(inBuf);
-            clReleaseMemObject(outImage);
-            return err;
-        }
-
-        err = clSetKernelArg(k, 0, sizeof(cl_mem), &inBuf);
+        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inBuf);
         if (err)
         {
             log_error("Error: could not set argument 0 of kernel in "
@@ -2983,11 +2962,10 @@ int DetectFloatToHalfRoundingMode(
                       err);
             clReleaseMemObject(inBuf);
             clReleaseMemObject(outImage);
-            clReleaseKernel(k);
             return err;
         }
 
-        err = clSetKernelArg(k, 1, sizeof(cl_mem), &outImage);
+        err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outImage);
         if (err)
         {
             log_error("Error: could not set argument 1 of kernel in "
@@ -2995,14 +2973,13 @@ int DetectFloatToHalfRoundingMode(
                       err);
             clReleaseMemObject(inBuf);
             clReleaseMemObject(outImage);
-            clReleaseKernel(k);
             return err;
         }
 
         // Run the kernel
         size_t global_work_size = count;
-        err = clEnqueueNDRangeKernel(q, k, 1, NULL, &global_work_size, NULL, 0,
-                                     NULL, NULL);
+        err = clEnqueueNDRangeKernel(q, kernel, 1, NULL, &global_work_size,
+                                     NULL, 0, NULL, NULL);
         if (err)
         {
             log_error("Error: could not enqueue kernel in "
@@ -3010,7 +2987,6 @@ int DetectFloatToHalfRoundingMode(
                       err);
             clReleaseMemObject(inBuf);
             clReleaseMemObject(outImage);
-            clReleaseKernel(k);
             return err;
         }
 
@@ -3028,7 +3004,6 @@ int DetectFloatToHalfRoundingMode(
                       err);
             clReleaseMemObject(inBuf);
             clReleaseMemObject(outImage);
-            clReleaseKernel(k);
             return err;
         }
 
@@ -3083,7 +3058,6 @@ int DetectFloatToHalfRoundingMode(
         // clean up
         clReleaseMemObject(inBuf);
         clReleaseMemObject(outImage);
-        clReleaseKernel(k);
         return err;
     }
 
diff --git a/test_conformance/api/test_create_kernels.cpp b/test_conformance/api/test_create_kernels.cpp
index 79e01fdb8f..568e84cb6c 100644
--- a/test_conformance/api/test_create_kernels.cpp
+++ b/test_conformance/api/test_create_kernels.cpp
@@ -525,11 +525,10 @@ int test_repeated_setup_cleanup(cl_device_id deviceID, cl_context context, cl_co
         local_queue = clCreateCommandQueue(local_context, deviceID, 0, &error);
         test_error( error, "clCreateCommandQueue failed");
 
-        error = create_single_kernel_helper(local_context, &local_program, NULL, 1, &repeate_test_kernel, NULL);
-        test_error( error, "Unable to build test program" );
-
-        local_kernel = clCreateKernel(local_program, "test_kernel", &error);
-        test_error( error, "clCreateKernel failed");
+        error = create_single_kernel_helper(
+            local_context, &local_program, &local_kernel, 1,
+            &repeate_test_kernel, "test_kernel");
+        test_error(error, "Unable to create kernel");
 
         local_mem_in = clCreateBuffer(local_context, CL_MEM_READ_ONLY, TEST_SIZE*sizeof(cl_int), NULL, &error);
         test_error( error, "clCreateBuffer failed");
diff --git a/test_conformance/api/test_null_buffer_arg.cpp b/test_conformance/api/test_null_buffer_arg.cpp
index ba43f183ea..d412d4eac6 100644
--- a/test_conformance/api/test_null_buffer_arg.cpp
+++ b/test_conformance/api/test_null_buffer_arg.cpp
@@ -157,14 +157,13 @@ int test_null_buffer_arg(cl_device_id device, cl_context context,
 
     // prep kernel:
     if (gIsEmbedded)
-        status = create_single_kernel_helper(context, &program, NULL, 1, &kernel_string, NULL);
+        status = create_single_kernel_helper(context, &program, &kernel, 1,
+                                             &kernel_string, "test_kernel");
     else
-        status = create_single_kernel_helper(context, &program, NULL, 1, &kernel_string_long, NULL);
+        status = create_single_kernel_helper(
+            context, &program, &kernel, 1, &kernel_string_long, "test_kernel");
 
-    test_error(status, "Unable to build test program");
-
-    kernel = clCreateKernel(program, "test_kernel", &status);
-    test_error(status, "CreateKernel failed.");
+    test_error(status, "Unable to create kernel");
 
     cl_mem dev_src = clCreateBuffer(context, CL_MEM_READ_ONLY, NITEMS*sizeof(cl_float),
         NULL, NULL);
diff --git a/test_conformance/api/test_retain.cpp b/test_conformance/api/test_retain.cpp
index cf065bcdb7..6e66c7dabc 100644
--- a/test_conformance/api/test_retain.cpp
+++ b/test_conformance/api/test_retain.cpp
@@ -251,11 +251,9 @@ int test_retain_mem_object_set_kernel_arg(cl_device_id deviceID, cl_context cont
     err = clSetMemObjectDestructorCallback( buffer, callback, nullptr );
     test_error( err, "Unable to set destructor callback" );
 
-    err = create_single_kernel_helper( context, &program, nullptr, 1, testProgram, nullptr );
-    test_error( err, "Unable to build sample program" );
-
-    kernel = clCreateKernel( program, "sample_test", &err );
-    test_error( err, "Unable to create sample_test kernel" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      testProgram, "sample_test");
+    test_error(err, "Unable to build sample program and sample_test kernel");
 
     err = clSetKernelArg( kernel, 0, sizeof(cl_mem), &buffer );
     test_error( err, "Unable to set kernel argument" );
diff --git a/test_conformance/api/test_retain_program.cpp b/test_conformance/api/test_retain_program.cpp
index aa9c8b36e5..b9fc8b7e24 100644
--- a/test_conformance/api/test_retain_program.cpp
+++ b/test_conformance/api/test_retain_program.cpp
@@ -28,14 +28,11 @@ int test_release_kernel_order(cl_device_id deviceID, cl_context context, cl_comm
     int error;
     const char *testProgram[] = { "__kernel void sample_test(__global int *data){}" };
 
-    /* Create a test program */
-    error = create_single_kernel_helper(context, &program, NULL, 1, testProgram, NULL);
+    /* Create a test program and kernel from it */
+    error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                        testProgram, "sample_test");
     test_error( error, "Unable to build sample program to test with" );
 
-    /* And create a kernel from it */
-    kernel = clCreateKernel( program, "sample_test", &error );
-    test_error( error, "Unable to create kernel" );
-
     /* Now try freeing the program first, then the kernel. If refcounts are right, this should work just fine */
     clReleaseProgram( program );
     clReleaseKernel( kernel );
diff --git a/test_conformance/d3d10/harness.cpp b/test_conformance/d3d10/harness.cpp
index ffdfea5a52..93f2281d8b 100644
--- a/test_conformance/d3d10/harness.cpp
+++ b/test_conformance/d3d10/harness.cpp
@@ -367,41 +367,12 @@ cl_int HarnessD3D10_CreateKernelFromSource(
         const char *sourceTexts[] = {source};
         size_t sourceLengths[] = {strlen(source) };
 
-        status = create_single_kernel_helper_create_program(context, &program, 1, &sourceTexts[0]);
+        status = create_single_kernel_helper(context, &program, &kernel, 1,
+                                             &sourceTexts[0], entrypoint);
         TestRequire(
             CL_SUCCESS == status,
             "clCreateProgramWithSource failed");
     }
-    status = clBuildProgram(
-        program,
-        0,
-        NULL,
-        NULL,
-        NULL,
-        NULL);
-    if (CL_SUCCESS != status)
-    {
-        char log[2048] = {0};
-        status = clGetProgramBuildInfo(
-            program,
-            device,
-            CL_PROGRAM_BUILD_LOG,
-            sizeof(log),
-            log,
-            NULL);
-        TestPrint("error: %s\n", log);
-        TestRequire(
-            CL_SUCCESS == status,
-            "Compilation error log:\n%s\n", log);
-    }
-
-    kernel = clCreateKernel(
-        program,
-        entrypoint,
-        &status);
-    TestRequire(
-        CL_SUCCESS == status,
-        "clCreateKernel failed");
 
     clReleaseProgram(program);
     *outKernel = kernel;
diff --git a/test_conformance/d3d11/harness.cpp b/test_conformance/d3d11/harness.cpp
index 687c6da27d..90ba200b1c 100644
--- a/test_conformance/d3d11/harness.cpp
+++ b/test_conformance/d3d11/harness.cpp
@@ -400,41 +400,10 @@ cl_int HarnessD3D11_CreateKernelFromSource(
         const char *sourceTexts[] = {source};
         size_t sourceLengths[] = {strlen(source) };
 
-        status = create_single_kernel_helper_create_program(context, &program, 1, &sourceTexts[0]);
-        TestRequire(
-            CL_SUCCESS == status,
-            "clCreateProgramWithSource failed");
+        status = create_single_kernel_helper(context, &program, &kernel, 1,
+                                             &sourceTexts[0], entrypoint);
+        TestRequire(CL_SUCCESS == status, "Kernel creation failed");
     }
-    status = clBuildProgram(
-        program,
-        0,
-        NULL,
-        NULL,
-        NULL,
-        NULL);
-    if (CL_SUCCESS != status)
-    {
-        char log[2048] = {0};
-        status = clGetProgramBuildInfo(
-            program,
-            device,
-            CL_PROGRAM_BUILD_LOG,
-            sizeof(log),
-            log,
-            NULL);
-        TestPrint("error: %s\n", log);
-        TestRequire(
-            CL_SUCCESS == status,
-            "Compilation error log:\n%s\n", log);
-    }
-
-    kernel = clCreateKernel(
-        program,
-        entrypoint,
-        &status);
-    TestRequire(
-        CL_SUCCESS == status,
-        "clCreateKernel failed");
 
     clReleaseProgram(program);
     *outKernel = kernel;
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 8f2e0a0c2f..d7f2ebf67d 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -25,6 +25,7 @@
 #include "harness/errorHelpers.h"
 #include "harness/kernelHelpers.h"
 #include "harness/parseParameters.h"
+#include "harness/typeWrappers.h"
 
 #if defined( __APPLE__ )
     #include <sys/sysctl.h>
@@ -1384,36 +1385,36 @@ void _LogBuildError( cl_program p, int line, const char *file )
 int InitILogbConstants( void )
 {
     int error;
-    const char *kernel =
-    "__kernel void GetILogBConstants( __global int *out )\n"
-    "{\n"
-    "   out[0] = FP_ILOGB0;\n"
-    "   out[1] = FP_ILOGBNAN;\n"
-    "}\n";
-
-    cl_program query;
-    error = create_single_kernel_helper(gContext, &query, NULL, 1, &kernel, NULL);
-    if (NULL == query || error)
-    {
-        vlog_error( "Error: Unable to create program to get FP_ILOGB0 and FP_ILOGBNAN for the device. (%d)", error );
-        return error;
-    }
-
-    cl_kernel k = clCreateKernel( query, "GetILogBConstants", &error );
-    if( NULL == k || error)
+    const char *kernelSource =
+        R"(__kernel void GetILogBConstants( __global int *out )
+        {
+            out[0] = FP_ILOGB0;
+            out[1] = FP_ILOGBNAN;
+        })";
+
+    clProgramWrapper query;
+    clKernelWrapper kernel;
+    error = create_single_kernel_helper(gContext, &query, &kernel, 1,
+                                        &kernelSource, "GetILogBConstants");
+    if (error != CL_SUCCESS)
     {
-      vlog_error( "Error: Unable to create kernel to get FP_ILOGB0 and FP_ILOGBNAN for the device. Err = %d", error );
+        vlog_error("Error: Unable to create kernel to get FP_ILOGB0 and "
+                   "FP_ILOGBNAN for the device. (%d)",
+                   error);
         return error;
     }
 
-    if((error = clSetKernelArg(k, 0, sizeof( gOutBuffer[gMinVectorSizeIndex]), &gOutBuffer[gMinVectorSizeIndex])))
+    if ((error =
+             clSetKernelArg(kernel, 0, sizeof(gOutBuffer[gMinVectorSizeIndex]),
+                            &gOutBuffer[gMinVectorSizeIndex])))
     {
         vlog_error( "Error: Unable to set kernel arg to get FP_ILOGB0 and FP_ILOGBNAN for the device. Err = %d", error );
         return error;
     }
 
     size_t dim = 1;
-    if((error = clEnqueueNDRangeKernel(gQueue, k, 1, NULL, &dim, NULL, 0, NULL, NULL) ))
+    if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &dim, NULL, 0,
+                                        NULL, NULL)))
     {
         vlog_error( "Error: Unable to execute kernel to get FP_ILOGB0 and FP_ILOGBNAN for the device. Err = %d", error );
         return error;
@@ -1429,45 +1430,43 @@ int InitILogbConstants( void )
     gDeviceILogb0 = data.ilogb0;
     gDeviceILogbNaN = data.ilogbnan;
 
-    clReleaseKernel(k);
-    clReleaseProgram(query);
-
     return 0;
 }
 
 int IsTininessDetectedBeforeRounding( void )
 {
     int error;
-    const char *kernel =
-    "__kernel void IsTininessDetectedBeforeRounding( __global float *out )\n"
-    "{\n"
-    "   volatile float a = 0x1.000002p-126f;\n"
-    "   volatile float b = 0x1.fffffcp-1f;\n"       // product is 0x1.fffffffffff8p-127
-    "   out[0] = a * b;\n"
-    "}\n";
-
-    cl_program query;
-    error = create_single_kernel_helper(gContext, &query, NULL, 1, &kernel, NULL);
+    const char *kernelSource =
+        R"(__kernel void IsTininessDetectedBeforeRounding( __global float *out )
+        {
+           volatile float a = 0x1.000002p-126f;
+           volatile float b = 0x1.fffffcp-1f;
+           out[0] = a * b; // product is 0x1.fffffffffff8p-127
+        })";
+
+    clProgramWrapper query;
+    clKernelWrapper kernel;
+    error =
+        create_single_kernel_helper(gContext, &query, &kernel, 1, &kernelSource,
+                                    "IsTininessDetectedBeforeRounding");
     if (error != CL_SUCCESS) {
-        vlog_error( "Error: Unable to create program to detect how tininess is detected for the device. (%d)", error );
-        return error;
-    }
-
-    cl_kernel k = clCreateKernel( query, "IsTininessDetectedBeforeRounding", &error );
-    if( NULL == k || error)
-    {
-      vlog_error( "Error: Unable to create kernel to detect how tininess is detected  for the device. Err = %d", error );
+        vlog_error("Error: Unable to create kernel to detect how tininess is "
+                   "detected for the device. (%d)",
+                   error);
         return error;
     }
 
-    if((error = clSetKernelArg(k, 0, sizeof( gOutBuffer[gMinVectorSizeIndex]), &gOutBuffer[gMinVectorSizeIndex])))
+    if ((error =
+             clSetKernelArg(kernel, 0, sizeof(gOutBuffer[gMinVectorSizeIndex]),
+                            &gOutBuffer[gMinVectorSizeIndex])))
     {
         vlog_error( "Error: Unable to set kernel arg to detect how tininess is detected  for the device. Err = %d", error );
         return error;
     }
 
     size_t dim = 1;
-    if((error = clEnqueueNDRangeKernel(gQueue, k, 1, NULL, &dim, NULL, 0, NULL, NULL) ))
+    if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &dim, NULL, 0,
+                                        NULL, NULL)))
     {
         vlog_error( "Error: Unable to execute kernel to detect how tininess is detected  for the device. Err = %d", error );
         return error;
@@ -1482,9 +1481,6 @@ int IsTininessDetectedBeforeRounding( void )
 
     gCheckTininessBeforeRounding = 0 == (data.f & 0x7fffffff);
 
-    clReleaseKernel(k);
-    clReleaseProgram(query);
-
     return 0;
 }
 
@@ -1505,22 +1501,11 @@ int MakeKernel(const char **c, cl_uint count, const char *name, cl_kernel *k,
       strcat(options, " -cl-fast-relaxed-math");
     }
 
-    error = create_single_kernel_helper(gContext, p, NULL, count, c, NULL, options);
+    error =
+        create_single_kernel_helper(gContext, p, k, count, c, name, options);
     if (error != CL_SUCCESS)
     {
-        vlog_error("\t\tFAILED -- Failed to create program. (%d)\n", error);
-        return error;
-    }
-
-    *k = clCreateKernel( *p, name, &error );
-    if( NULL == *k || error )
-    {
-        char    buffer[2048] = "";
-
-        vlog_error("\t\tFAILED -- clCreateKernel() failed: (%d)\n", error);
-        clGetProgramBuildInfo(*p, gDevice, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
-        vlog_error("Log: %s\n", buffer);
-        clReleaseProgram( *p );
+        vlog_error("\t\tFAILED -- Failed to create kernel. (%d)\n", error);
         return error;
     }
 
@@ -1581,36 +1566,36 @@ int MakeKernels(const char **c, cl_uint count, const char *name,
 static int IsInRTZMode( void )
 {
     int error;
-    const char *kernel =
-    "__kernel void GetRoundingMode( __global int *out )\n"
-    "{\n"
-    "   volatile float a = 0x1.0p23f;\n"
-    "   volatile float b = -0x1.0p23f;\n"
-    "   out[0] = (a + 0x1.fffffep-1f == a) && (b - 0x1.fffffep-1f == b);\n"
-    "}\n";
-
-    cl_program query;
-    error = create_single_kernel_helper(gContext, &query, NULL, 1, &kernel, NULL);
+    const char *kernelSource =
+        R"(__kernel void GetRoundingMode( __global int *out )
+        {
+            volatile float a = 0x1.0p23f;
+            volatile float b = -0x1.0p23f;
+            out[0] = (a + 0x1.fffffep-1f == a) && (b - 0x1.fffffep-1f == b);
+        })";
+
+    clProgramWrapper query;
+    clKernelWrapper kernel;
+    error = create_single_kernel_helper(gContext, &query, &kernel, 1,
+                                        &kernelSource, "GetRoundingMode");
     if (error != CL_SUCCESS) {
-        vlog_error( "Error: Unable to create program to detect RTZ mode for the device. (%d)", error );
-        return error;
-    }
-
-    cl_kernel k = clCreateKernel( query, "GetRoundingMode", &error );
-    if( NULL == k || error)
-    {
-        vlog_error( "Error: Unable to create kernel to gdetect RTZ mode for the device. Err = %d", error );
+        vlog_error("Error: Unable to create kernel to detect RTZ mode for the "
+                   "device. (%d)",
+                   error);
         return error;
     }
 
-    if((error = clSetKernelArg(k, 0, sizeof( gOutBuffer[gMinVectorSizeIndex]), &gOutBuffer[gMinVectorSizeIndex])))
+    if ((error =
+             clSetKernelArg(kernel, 0, sizeof(gOutBuffer[gMinVectorSizeIndex]),
+                            &gOutBuffer[gMinVectorSizeIndex])))
     {
         vlog_error( "Error: Unable to set kernel arg to detect RTZ mode for the device. Err = %d", error );
         return error;
     }
 
     size_t dim = 1;
-    if((error = clEnqueueNDRangeKernel(gQueue, k, 1, NULL, &dim, NULL, 0, NULL, NULL) ))
+    if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &dim, NULL, 0,
+                                        NULL, NULL)))
     {
         vlog_error( "Error: Unable to execute kernel to detect RTZ mode for the device. Err = %d", error );
         return error;
@@ -1623,9 +1608,6 @@ static int IsInRTZMode( void )
         return error;
     }
 
-    clReleaseKernel(k);
-    clReleaseProgram(query);
-
     return data.isRTZ;
 }
 
diff --git a/test_conformance/printf/test_printf.cpp b/test_conformance/printf/test_printf.cpp
index b169e6b9bc..2b804e402f 100644
--- a/test_conformance/printf/test_printf.cpp
+++ b/test_conformance/printf/test_printf.cpp
@@ -306,15 +306,22 @@ static cl_program makePrintfProgram(cl_kernel *kernel_ptr, const cl_context cont
 
     if(allTestCase[testId]->_type == TYPE_VECTOR)
     {
-        err = create_single_kernel_helper(context, &program, NULL, sizeof(sourceVec) / sizeof(sourceVec[0]), sourceVec, NULL);
+        err = create_single_kernel_helper(
+            context, &program, kernel_ptr,
+            sizeof(sourceVec) / sizeof(sourceVec[0]), sourceVec, testname);
     }
     else if(allTestCase[testId]->_type == TYPE_ADDRESS_SPACE)
     {
-        err = create_single_kernel_helper(context, &program, NULL, sizeof(sourceAddrSpace) / sizeof(sourceAddrSpace[0]), sourceAddrSpace, NULL);
+        err = create_single_kernel_helper(context, &program, kernel_ptr,
+                                          sizeof(sourceAddrSpace)
+                                              / sizeof(sourceAddrSpace[0]),
+                                          sourceAddrSpace, testname);
     }
     else
     {
-        err = create_single_kernel_helper(context, &program, NULL, sizeof(sourceGen) / sizeof(sourceGen[0]), sourceGen, NULL);
+        err = create_single_kernel_helper(
+            context, &program, kernel_ptr,
+            sizeof(sourceGen) / sizeof(sourceGen[0]), sourceGen, testname);
     }
 
     if (!program || err) {
@@ -322,12 +329,6 @@ static cl_program makePrintfProgram(cl_kernel *kernel_ptr, const cl_context cont
         return NULL;
     }
 
-    *kernel_ptr = clCreateKernel(program, testname, &err);
-    if ( err ) {
-        log_error("clCreateKernel failed (%d)\n", err);
-        return NULL;
-    }
-
     return program;
 }
 
diff --git a/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp b/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp
index 84f8ed1f09..9e1789c27d 100644
--- a/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp
+++ b/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp
@@ -129,13 +129,9 @@ int test_ext_cl_khr_spirv_no_integer_wrap_decoration(cl_device_id deviceID,
     {
         // Run the cl kernel for reference results
         clProgramWrapper prog;
-        err = create_single_kernel_helper_create_program(context, &prog, 1, &kernelBuf, NULL);
-        SPIRV_CHECK_ERROR(err, "Failed to create cl program");
-
-        err = clBuildProgram(prog, 1, &deviceID, NULL, NULL, NULL);
-        SPIRV_CHECK_ERROR(err, "Failed to build program");
-
-        clKernelWrapper kernel = clCreateKernel(prog, "fmath_cl", &err);
+        clKernelWrapper kernel;
+        err = create_single_kernel_helper(context, &prog, &kernel, 1,
+                                          &kernelBuf, "fmath_cl");
         SPIRV_CHECK_ERROR(err, "Failed to create cl kernel");
 
         clMemWrapper ref = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &err);
diff --git a/test_conformance/spirv_new/test_op_fmath.cpp b/test_conformance/spirv_new/test_op_fmath.cpp
index 7250eb159f..bec0667ce6 100644
--- a/test_conformance/spirv_new/test_op_fmath.cpp
+++ b/test_conformance/spirv_new/test_op_fmath.cpp
@@ -89,13 +89,9 @@ int test_fmath(cl_device_id deviceID,
     {
         // Run the cl kernel for reference results
         clProgramWrapper prog;
-        err = create_single_kernel_helper_create_program(context, &prog, 1, &kernelBuf, NULL);
-        SPIRV_CHECK_ERROR(err, "Failed to create cl program");
-
-        err = clBuildProgram(prog, 1, &deviceID, NULL, NULL, NULL);
-        SPIRV_CHECK_ERROR(err, "Failed to build program");
-
-        clKernelWrapper kernel = clCreateKernel(prog, "fmath_cl", &err);
+        clKernelWrapper kernel;
+        err = create_single_kernel_helper(context, &prog, &kernel, 1,
+                                          &kernelBuf, "fmath_cl");
         SPIRV_CHECK_ERROR(err, "Failed to create cl kernel");
 
         clMemWrapper ref = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &err);
diff --git a/test_conformance/spirv_new/test_op_vector_times_scalar.cpp b/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
index 99d71f726a..0a604bcf1a 100644
--- a/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
+++ b/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
@@ -82,15 +82,11 @@ int test_vector_times_scalar(cl_device_id deviceID,
     {
         // Run the cl kernel for reference results
         clProgramWrapper prog;
-        err = create_single_kernel_helper_create_program(context, &prog, 1, &kernelBuf, NULL);
+        clKernelWrapper kernel;
+        err = create_single_kernel_helper(context, &prog, &kernel, 1,
+                                          &kernelBuf, "vector_times_scalar");
         SPIRV_CHECK_ERROR(err, "Failed to create cl program");
 
-        err = clBuildProgram(prog, 1, &deviceID, NULL, NULL, NULL);
-        SPIRV_CHECK_ERROR(err, "Failed to build program");
-
-        clKernelWrapper kernel = clCreateKernel(prog, "vector_times_scalar", &err);
-        SPIRV_CHECK_ERROR(err, "Failed to create cl kernel");
-
         clMemWrapper ref = clCreateBuffer(context, CL_MEM_READ_WRITE, res_bytes, NULL, &err);
         SPIRV_CHECK_ERROR(err, "Failed to create ref buffer");
 

From 904fb419ee81830d406fc5e022135cbccfaf9f6a Mon Sep 17 00:00:00 2001
From: Jeremy Kemp <jeremy@jeremykemp.co.uk>
Date: Thu, 7 Jan 2021 11:34:58 +0000
Subject: [PATCH 009/158] Restored the embedded reduction factor to bruteforce.
 (#1052)

* Restored the embedded reduction factor to bruteforce.

This change was present on the GitLab branch but missed out during the transition to GitHub.

This change is intentionally as close as possible to the patch on GitLab.

Fixes #1045

* Added helper functions for bruteforce step and scale.

* Added missing files from 1e4d19b.

* Renamed getTestScale and getTestStep to set*.
---
 test_conformance/math_brute_force/Utility.h   | 33 ++++++++++++++++++-
 test_conformance/math_brute_force/binary.cpp  |  8 ++---
 .../math_brute_force/binaryOperator.cpp       |  6 ++--
 .../math_brute_force/binary_i.cpp             |  9 ++---
 .../math_brute_force/binary_two_results_i.cpp | 11 ++-----
 test_conformance/math_brute_force/i_unary.cpp | 13 ++------
 .../math_brute_force/macro_binary.cpp         |  7 ++--
 .../math_brute_force/macro_unary.cpp          |  7 ++--
 test_conformance/math_brute_force/mad.cpp     | 13 ++------
 test_conformance/math_brute_force/ternary.cpp | 12 ++-----
 test_conformance/math_brute_force/unary.cpp   |  8 ++---
 .../math_brute_force/unary_two_results.cpp    | 12 ++-----
 .../math_brute_force/unary_two_results_i.cpp  | 13 ++------
 test_conformance/math_brute_force/unary_u.cpp | 14 +++-----
 14 files changed, 72 insertions(+), 94 deletions(-)

diff --git a/test_conformance/math_brute_force/Utility.h b/test_conformance/math_brute_force/Utility.h
index 31256358b3..92f8f3dc94 100644
--- a/test_conformance/math_brute_force/Utility.h
+++ b/test_conformance/math_brute_force/Utility.h
@@ -31,6 +31,7 @@
 #include "harness/conversions.h"
 
 #define BUFFER_SIZE         (1024*1024*2)
+#define EMBEDDED_REDUCTION_FACTOR (64)
 
 #if defined( __GNUC__ )
     #define UNUSED  __attribute__ ((unused))
@@ -228,6 +229,36 @@ void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int is
 
 float getAllowedUlpError(const Func *f, const bool relaxed);
 
-#endif /* UTILITY_H */
+// Returns the value the brute-force tests store in test_info.scale: a
+// multiple of the type size and gWimpyReductionFactor in wimpy mode,
+// EMBEDDED_REDUCTION_FACTOR when gIsEmbedded is set, and 1 otherwise.
+static inline cl_uint getTestScale(size_t typeSize)
+{
+    // Wimpy mode is checked first, so it wins over the embedded setting.
+    if (gWimpyMode)
+    {
+        return (cl_uint)typeSize * 2 * gWimpyReductionFactor;
+    }
+
+    if (gIsEmbedded) return EMBEDDED_REDUCTION_FACTOR;
+
+    return 1;
+}
 
+// Returns the `step` value for the brute-force tests (assumed to be counted
+// in elements of the tested type -- confirm against the callers' loops).
+static inline uint64_t getTestStep(size_t typeSize, size_t bufferSize)
+{
+    // Wimpy mode is checked first; its step depends only on
+    // gWimpyReductionFactor, not on the type or buffer size.
+    if (gWimpyMode) return (1ULL << 32) * gWimpyReductionFactor / (512);
+
+    // Elements of the given size that fit in the caller's buffer.
+    const uint64_t elementsPerBuffer = bufferSize / typeSize;
+    // Scale the caller's bufferSize rather than the BUFFER_SIZE macro so
+    // both non-wimpy branches are derived from the same buffer.
+    if (gIsEmbedded) return elementsPerBuffer * EMBEDDED_REDUCTION_FACTOR;
+    return elementsPerBuffer;
+}
 
+#endif /* UTILITY_H */
diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary.cpp
index eb5007c09d..0b8be27b6a 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary.cpp
@@ -277,12 +277,12 @@ int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter,
     memset( &test_info, 0, sizeof( test_info ) );
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = 1;
+    test_info.scale = getTestScale(sizeof(cl_float));
 
     if (gWimpyMode){
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-        test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
@@ -1014,13 +1014,13 @@ int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
     memset( &test_info, 0, sizeof( test_info ) );
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = 1;
+    test_info.scale = getTestScale(sizeof(cl_double));
 
 
     if (gWimpyMode){
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-        test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/binaryOperator.cpp b/test_conformance/math_brute_force/binaryOperator.cpp
index 0742964d3b..abcb1b0015 100644
--- a/test_conformance/math_brute_force/binaryOperator.cpp
+++ b/test_conformance/math_brute_force/binaryOperator.cpp
@@ -269,10 +269,9 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
     memset( &test_info, 0, sizeof( test_info ) );
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale =  1;
+    test_info.scale = getTestScale(sizeof(cl_float));
     if (gWimpyMode) {
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-        test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
 
     test_info.step = test_info.subBufferSize * test_info.scale;
@@ -963,11 +962,10 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
     memset( &test_info, 0, sizeof( test_info ) );
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale =  1;
+    test_info.scale = getTestScale(sizeof(cl_double));
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-        test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
 
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i.cpp
index 6ba0eb58e5..01f45242b8 100644
--- a/test_conformance/math_brute_force/binary_i.cpp
+++ b/test_conformance/math_brute_force/binary_i.cpp
@@ -266,12 +266,13 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
     memset( &test_info, 0, sizeof( test_info ) );
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale =  1;
+    test_info.scale = getTestScale(sizeof(cl_float));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-        test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
@@ -780,12 +781,12 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
     memset( &test_info, 0, sizeof( test_info ) );
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale =  1;
+    test_info.scale = getTestScale(sizeof(cl_double));
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-        test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
index c5577b9e60..af1b04d1dc 100644
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i.cpp
@@ -287,17 +287,13 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
     size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
-    uint64_t step = bufferSize / sizeof( float );
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
 
 #if defined PARALLEL_REFERENCE
     cl_uint threadCount = GetThreadCount();
 #endif
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
-    if(gWimpyMode ){
-        step = (1ULL<<32) * gWimpyReductionFactor / (512);
-    }
-
     if( gIsEmbedded )
         float_ulps = f->float_embedded_ulps;
     else
@@ -716,12 +712,9 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     double maxErrorVal = 0.0f;
     double maxErrorVal2 = 0.0f;
     size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
-    uint64_t step = bufferSize / sizeof( double );
+    uint64_t step = getTestStep(sizeof(double), bufferSize);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    if(gWimpyMode ){
-       step = (1ULL<<32) * gWimpyReductionFactor / (512);
-    }
 
 #if defined PARALLEL_REFERENCE
     cl_uint threadCount = GetThreadCount();
diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary.cpp
index 379d8e353f..f6bd1223fe 100644
--- a/test_conformance/math_brute_force/i_unary.cpp
+++ b/test_conformance/math_brute_force/i_unary.cpp
@@ -191,14 +191,10 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     cl_kernel kernels[ VECTOR_SIZE_COUNT ];
     int ftz = f->ftz || 0 == (gFloatCapabilities & CL_FP_DENORM) || gForceFTZ;
     size_t bufferSize = (gWimpyMode)?gWimpyBufferSize:BUFFER_SIZE;
-    uint64_t step = bufferSize / sizeof( float );
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
     int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-    if( gWimpyMode )
-    {
-        step = (1ULL<<32) * gWimpyReductionFactor / (512);
-    }
 
     // This test is not using ThreadPool so we need to disable FTZ here
     // for reference computations
@@ -412,14 +408,11 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     cl_kernel kernels[ VECTOR_SIZE_COUNT ];
     int ftz = f->ftz || gForceFTZ;
     size_t bufferSize = (gWimpyMode)?gWimpyBufferSize:BUFFER_SIZE;
-    uint64_t step = bufferSize / sizeof( cl_double );
+    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
     int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( cl_double )) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    if( gWimpyMode )
-    {
-        step = (1ULL<<32) * gWimpyReductionFactor / (512);
-    }
+
     // This test is not using ThreadPool so we need to disable FTZ here
     // for reference computations
     FPU_mode_type oldMode;
diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary.cpp
index b590f50a85..1cde215ce3 100644
--- a/test_conformance/math_brute_force/macro_binary.cpp
+++ b/test_conformance/math_brute_force/macro_binary.cpp
@@ -253,12 +253,12 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     memset( &test_info, 0, sizeof( test_info ) );
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale =  1;
+    test_info.scale = getTestScale(sizeof(cl_float));
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-        test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
@@ -765,11 +765,10 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     memset( &test_info, 0, sizeof( test_info ) );
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale =  1;
+    test_info.scale = getTestScale(sizeof(cl_double));
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-         test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
 
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary.cpp
index 872007f156..70f724ceb6 100644
--- a/test_conformance/math_brute_force/macro_unary.cpp
+++ b/test_conformance/math_brute_force/macro_unary.cpp
@@ -224,12 +224,12 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     memset( &test_info, 0, sizeof( test_info ) );
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale =  1;
+    test_info.scale = getTestScale(sizeof(cl_float));
     if (gWimpyMode )
     {
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-        test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
@@ -623,11 +623,10 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     memset( &test_info, 0, sizeof( test_info ) );
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale =  1;
+    test_info.scale = getTestScale(sizeof(cl_double));
     if (gWimpyMode )
     {
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-        test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
 
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad.cpp
index 0737afbc20..ed1d7d53fb 100644
--- a/test_conformance/math_brute_force/mad.cpp
+++ b/test_conformance/math_brute_force/mad.cpp
@@ -207,12 +207,8 @@ int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal2 = 0.0f;
     float maxErrorVal3 = 0.0f;
     size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
-    uint64_t step = bufferSize / sizeof( float );
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
 
-    if( gWimpyMode )
-    {
-        step = (1ULL<<32) * gWimpyReductionFactor / (512);
-    }
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
@@ -675,11 +671,8 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
     size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    uint64_t step = bufferSize / sizeof( double );
-    if( gWimpyMode )
-    {
-        step = (1ULL<<32) * gWimpyReductionFactor / (512);
-    }
+    uint64_t step = getTestStep(sizeof(double), bufferSize);
+
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary.cpp
index 2c4b503ef2..fd97a95df7 100644
--- a/test_conformance/math_brute_force/ternary.cpp
+++ b/test_conformance/math_brute_force/ternary.cpp
@@ -228,16 +228,12 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal3 = 0.0f;
     size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
 
-    uint64_t step = bufferSize / sizeof( float );
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
     int skipNanInf = (0 == strcmp( "fma", f->nameInCode )) && ! gInfNanSupport;
     cl_uchar overflow[BUFFER_SIZE / sizeof( float )];
     float float_ulps;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-    if( gWimpyMode )
-    {
-        step = (1ULL<<32) * gWimpyReductionFactor / (512);
-    }
 
     if( gIsEmbedded )
         float_ulps = f->float_embedded_ulps;
@@ -874,11 +870,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
-    uint64_t step = bufferSize / sizeof( double );
-    if( gWimpyMode )
-    {
-        step = (1ULL<<32) * gWimpyReductionFactor / (512);
-    }
+    uint64_t step = getTestStep(sizeof(double), bufferSize);
 
     Force64BitFPUPrecision();
 
diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary.cpp
index 0cde4f3019..8ef33119c8 100644
--- a/test_conformance/math_brute_force/unary.cpp
+++ b/test_conformance/math_brute_force/unary.cpp
@@ -240,12 +240,12 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     test_info.threadCount = GetThreadCount();
 
     test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale =  1;
+    test_info.scale = getTestScale(sizeof(cl_float));
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-        test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
@@ -1036,12 +1036,12 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     memset( &test_info, 0, sizeof( test_info ) );
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale =  1;
+    test_info.scale = getTestScale(sizeof(cl_double));
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-        test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results.cpp
index a86277f1d9..b170e09585 100644
--- a/test_conformance/math_brute_force/unary_two_results.cpp
+++ b/test_conformance/math_brute_force/unary_two_results.cpp
@@ -203,7 +203,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal0 = 0.0f;
     float maxErrorVal1 = 0.0f;
     size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
-    uint64_t step = bufferSize / sizeof( float );
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
     int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);
     cl_uchar overflow[BUFFER_SIZE / sizeof( float )];
     int isFract = 0 == strcmp( "fract", f->nameInCode );
@@ -211,10 +211,6 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
     float float_ulps = getAllowedUlpError(f, relaxedMode);
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-    if( gWimpyMode )
-    {
-        step = (1ULL<<32) * gWimpyReductionFactor / (512);
-    }
 
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
@@ -666,14 +662,10 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
     double maxErrorVal0 = 0.0f;
     double maxErrorVal1 = 0.0f;
     size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
-    uint64_t step = bufferSize / sizeof( cl_double );
+    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
     int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( cl_double )) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    if( gWimpyMode )
-    {
-        step = (1ULL<<32) * gWimpyReductionFactor / (512);
-    }
 
     Force64BitFPUPrecision();
 
diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i.cpp
index 108be6a4e5..153268825d 100644
--- a/test_conformance/math_brute_force/unary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i.cpp
@@ -209,15 +209,12 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal2 = 0.0f;
     size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
     float float_ulps;
-     uint64_t step = bufferSize / sizeof( float );
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
     int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);
     cl_ulong  maxiError;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-    if( gWimpyMode )
-    {
-        step = (1ULL<<32) * gWimpyReductionFactor / (512);
-    }
+
     if( gIsEmbedded )
         float_ulps = f->float_embedded_ulps;
     else
@@ -513,14 +510,10 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
     cl_ulong  maxiError = f->double_ulps == INFINITY ? CL_ULONG_MAX : 0;
     size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
 
-    uint64_t step = bufferSize / sizeof( double );
+    uint64_t step = getTestStep(sizeof(double), bufferSize);
     int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( double )) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    if( gWimpyMode )
-    {
-        step = (1ULL<<32) * gWimpyReductionFactor / (512);
-    }
 
     Force64BitFPUPrecision();
 
diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u.cpp
index 87fcae321b..97fd25f96c 100644
--- a/test_conformance/math_brute_force/unary_u.cpp
+++ b/test_conformance/math_brute_force/unary_u.cpp
@@ -196,17 +196,14 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal = 0.0f;
     size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
 
-    uint64_t step = bufferSize / sizeof( float );
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
     int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( double )) + 1);
     int isRangeLimited = 0;
     float float_ulps;
     float half_sin_cos_tan_limit = 0;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-    if( gWimpyMode )
-    {
-        step = (1ULL<<32) * gWimpyReductionFactor / (512);
-    }
+
     if( gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
@@ -473,13 +470,10 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
     int ftz = f->ftz || gForceFTZ;
     double maxErrorVal = 0.0f;
     size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
-    uint64_t step = bufferSize / sizeof( cl_double );
+    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    if( gWimpyMode )
-    {
-        step = (1ULL<<32) * gWimpyReductionFactor / (512);
-    }
+
     Force64BitFPUPrecision();
 
     // Init the kernels

From 18c54be0a4077e168d593847a1186d23aee4b514 Mon Sep 17 00:00:00 2001
From: ellnor01 <51320439+ellnor01@users.noreply.github.com>
Date: Thu, 7 Jan 2021 11:39:57 +0000
Subject: [PATCH 010/158] Validate CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED
 (#1086)

* Validate CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED

Adding test to validate value returned from clGetDeviceInfo with
CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED.

Fixes #993

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>

* Fixes for formatting on computeinfo add tests

Fixes #993

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>
---
 test_conformance/computeinfo/CMakeLists.txt   |  1 +
 .../computeinfo/conforming_version.cpp        | 37 +++++++++++++++++++
 test_conformance/computeinfo/main.cpp         |  4 ++
 3 files changed, 42 insertions(+)
 create mode 100644 test_conformance/computeinfo/conforming_version.cpp

diff --git a/test_conformance/computeinfo/CMakeLists.txt b/test_conformance/computeinfo/CMakeLists.txt
index 9bdc9e494c..207223a3e3 100644
--- a/test_conformance/computeinfo/CMakeLists.txt
+++ b/test_conformance/computeinfo/CMakeLists.txt
@@ -4,6 +4,7 @@ set(${MODULE_NAME}_SOURCES
         main.cpp
         device_uuid.cpp
         extended_versioning.cpp
+        conforming_version.cpp
 )
 
 include(../CMakeCommon.txt)
diff --git a/test_conformance/computeinfo/conforming_version.cpp b/test_conformance/computeinfo/conforming_version.cpp
new file mode 100644
index 0000000000..8c7eb29d50
--- /dev/null
+++ b/test_conformance/computeinfo/conforming_version.cpp
@@ -0,0 +1,37 @@
+
+//
+// Copyright (c) 2020 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <regex>
+#include "harness/testHarness.h"
+#include "harness/deviceInfo.h"
+
+int test_conformance_version(cl_device_id deviceID, cl_context context,
+                             cl_command_queue ignoreQueue, int num_elements)
+{
+    auto version_string{ get_device_info_string(
+        deviceID, CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED) };
+
+    // Require the form vYYYY-MM-DD-XX: four-digit year, month 01-12, day
+    // 01-31, and XX exactly two digits (regex_match tests the whole string).
+    std::regex valid_format("^v\\d{4}-(((0)[1-9])|((1)[0-2]))-((0)[1-9]|[1-2]["
+                            "0-9]|(3)[0-1])-\\d{2}$");
+    test_assert_error(
+        std::regex_match(version_string, valid_format),
+        "CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED does not return "
+        "valid format vYYYY-MM-DD-XX");
+
+    return TEST_PASS;
+}
\ No newline at end of file
diff --git a/test_conformance/computeinfo/main.cpp b/test_conformance/computeinfo/main.cpp
index 47975f862d..4860b44561 100644
--- a/test_conformance/computeinfo/main.cpp
+++ b/test_conformance/computeinfo/main.cpp
@@ -1422,10 +1422,14 @@ extern int test_extended_versioning(cl_device_id, cl_context, cl_command_queue,
                                     int);
 extern int test_device_uuid(cl_device_id, cl_context, cl_command_queue, int);
 
+extern int test_conformance_version(cl_device_id, cl_context, cl_command_queue,
+                                    int);
+
 test_definition test_list[] = {
     ADD_TEST(computeinfo),
     ADD_TEST(extended_versioning),
     ADD_TEST(device_uuid),
+    ADD_TEST_VERSION(conformance_version, Version(3, 0)),
 };
 
 const int test_num = ARRAY_SIZE(test_list);

From 24e6a9125c44ef9fe42603b605a685af9acb4bdf Mon Sep 17 00:00:00 2001
From: james-morrissey-arm <james-morrissey-arm@users.noreply.github.com>
Date: Thu, 7 Jan 2021 12:26:44 +0000
Subject: [PATCH 011/158] Use the right flags when creating images (#328)
 (#1015)

Test was querying for supported images with CL_MEM_WRITE_ONLY flag, but
always used CL_MEM_READ_ONLY to create images.

Fixes issue #328

Signed-off-by: Radek Szymanski <radek.szymanski@arm.com>
Signed-off-by: James Morrissey <james.morrissey@arm.com>

Co-authored-by: Radek Szymanski <radek.szymanski@arm.com>
---
 .../images/clReadWriteImage/test_loops.cpp    | 37 ++++++++++++++-----
 .../images/clReadWriteImage/test_read_1D.cpp  | 33 +++++++++++------
 .../clReadWriteImage/test_read_1D_array.cpp   | 24 ++++++++----
 .../images/clReadWriteImage/test_read_2D.cpp  | 23 ++++++++----
 .../clReadWriteImage/test_read_2D_array.cpp   | 24 ++++++++----
 .../images/clReadWriteImage/test_read_3D.cpp  | 25 +++++++++----
 6 files changed, 115 insertions(+), 51 deletions(-)

diff --git a/test_conformance/images/clReadWriteImage/test_loops.cpp b/test_conformance/images/clReadWriteImage/test_loops.cpp
index f0690e186c..10fb7a7d44 100644
--- a/test_conformance/images/clReadWriteImage/test_loops.cpp
+++ b/test_conformance/images/clReadWriteImage/test_loops.cpp
@@ -16,11 +16,23 @@
 #include "../testBase.h"
 #include "../common.h"
 
-extern int test_read_image_set_1D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format );
-extern int test_read_image_set_2D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format );
-extern int test_read_image_set_3D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format );
-extern int test_read_image_set_1D_array( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format );
-extern int test_read_image_set_2D_array( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format );
+extern int test_read_image_set_1D(cl_device_id device, cl_context context,
+                                  cl_command_queue queue,
+                                  cl_image_format *format, cl_mem_flags flags);
+extern int test_read_image_set_2D(cl_device_id device, cl_context context,
+                                  cl_command_queue queue,
+                                  cl_image_format *format, cl_mem_flags flags);
+extern int test_read_image_set_3D(cl_device_id device, cl_context context,
+                                  cl_command_queue queue,
+                                  cl_image_format *format, cl_mem_flags flags);
+extern int test_read_image_set_1D_array(cl_device_id device, cl_context context,
+                                        cl_command_queue queue,
+                                        cl_image_format *format,
+                                        cl_mem_flags flags);
+extern int test_read_image_set_2D_array(cl_device_id device, cl_context context,
+                                        cl_command_queue queue,
+                                        cl_image_format *format,
+                                        cl_mem_flags flags);
 
 int test_image_type( cl_device_id device, cl_context context, cl_command_queue queue, cl_mem_object_type imageType, cl_mem_flags flags )
 {
@@ -73,19 +85,24 @@ int test_image_type( cl_device_id device, cl_context context, cl_command_queue q
 
         switch (imageType) {
             case CL_MEM_OBJECT_IMAGE1D:
-                test_return = test_read_image_set_1D( device, context, queue, &formatList[ i ] );
+                test_return = test_read_image_set_1D(device, context, queue,
+                                                     &formatList[i], flags);
                 break;
             case CL_MEM_OBJECT_IMAGE2D:
-                test_return = test_read_image_set_2D( device, context, queue, &formatList[ i ] );
+                test_return = test_read_image_set_2D(device, context, queue,
+                                                     &formatList[i], flags);
                 break;
             case CL_MEM_OBJECT_IMAGE3D:
-                test_return = test_read_image_set_3D( device,context, queue,  &formatList[ i ] );
+                test_return = test_read_image_set_3D(device, context, queue,
+                                                     &formatList[i], flags);
                 break;
             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
-                test_return = test_read_image_set_1D_array( device, context, queue, &formatList[ i ] );
+                test_return = test_read_image_set_1D_array(
+                    device, context, queue, &formatList[i], flags);
                 break;
             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
-                test_return = test_read_image_set_2D_array( device, context, queue, &formatList[ i ] );
+                test_return = test_read_image_set_2D_array(
+                    device, context, queue, &formatList[i], flags);
                 break;
         }
 
diff --git a/test_conformance/images/clReadWriteImage/test_read_1D.cpp b/test_conformance/images/clReadWriteImage/test_read_1D.cpp
index 8f996e8ff5..eef5bf4e56 100644
--- a/test_conformance/images/clReadWriteImage/test_read_1D.cpp
+++ b/test_conformance/images/clReadWriteImage/test_read_1D.cpp
@@ -15,7 +15,9 @@
 //
 #include "../testBase.h"
 
-int test_read_image_1D( cl_context context, cl_command_queue queue, image_descriptor *imageInfo, MTdata d )
+int test_read_image_1D(cl_context context, cl_command_queue queue,
+                       image_descriptor *imageInfo, MTdata d,
+                       cl_mem_flags flags)
 {
     int error;
 
@@ -34,12 +36,14 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, image_descri
     // Construct testing sources
   if(!gTestMipmaps)
   {
-    image = create_image_1d( context, (cl_mem_flags)(CL_MEM_READ_ONLY), imageInfo->format, imageInfo->width, 0, NULL, NULL, &error );
-    if( image == NULL )
-    {
-      log_error( "ERROR: Unable to create 1D image of size %d (%s)", (int)imageInfo->width, IGetErrorString( error ) );
-      return -1;
-    }
+      image = create_image_1d(context, flags, imageInfo->format,
+                              imageInfo->width, 0, NULL, NULL, &error);
+      if (image == NULL)
+      {
+          log_error("ERROR: Unable to create 1D image of size %d (%s)",
+                    (int)imageInfo->width, IGetErrorString(error));
+          return -1;
+      }
   }
   else
   {
@@ -48,7 +52,8 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, image_descri
     image_desc.image_width = imageInfo->width;
     image_desc.num_mip_levels = imageInfo->num_mip_levels;
 
-    image = clCreateImage( context, CL_MEM_READ_ONLY, imageInfo->format, &image_desc, NULL, &error);
+    image = clCreateImage(context, flags, imageInfo->format, &image_desc, NULL,
+                          &error);
     if( error != CL_SUCCESS )
     {
       log_error( "ERROR: Unable to create %d level mipmapped 1D image of size %d x %d (pitch %d ) (%s)",(int)imageInfo->num_mip_levels, (int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->rowPitch, IGetErrorString( error ) );
@@ -158,7 +163,9 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, image_descri
     return 0;
 }
 
-int test_read_image_set_1D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format )
+int test_read_image_set_1D(cl_device_id device, cl_context context,
+                           cl_command_queue queue, cl_image_format *format,
+                           cl_mem_flags flags)
 {
     size_t maxWidth;
     cl_ulong maxAllocSize, memSize;
@@ -192,7 +199,8 @@ int test_read_image_set_1D( cl_device_id device, cl_context context, cl_command_
             if( gDebugTrace )
                 log_info( "   at size %d\n", (int)imageInfo.width );
 
-            int ret = test_read_image_1D( context, queue, &imageInfo, seed );
+            int ret =
+                test_read_image_1D(context, queue, &imageInfo, seed, flags);
             if( ret )
                 return -1;
         }
@@ -216,7 +224,7 @@ int test_read_image_set_1D( cl_device_id device, cl_context context, cl_command_
             log_info("Testing %d\n", (int)imageInfo.width);
             if( gDebugTrace )
                 log_info( "   at max size %d\n", (int)maxWidth );
-            if( test_read_image_1D( context, queue, &imageInfo, seed ) )
+            if (test_read_image_1D(context, queue, &imageInfo, seed, flags))
                 return -1;
         }
     }
@@ -252,7 +260,8 @@ int test_read_image_set_1D( cl_device_id device, cl_context context, cl_command_
 
             if( gDebugTrace )
                 log_info( "   at size %d (row pitch %d) out of %d\n", (int)imageInfo.width, (int)imageInfo.rowPitch, (int)maxWidth );
-            int ret = test_read_image_1D( context, queue, &imageInfo, seed );
+            int ret =
+                test_read_image_1D(context, queue, &imageInfo, seed, flags);
             if( ret )
                 return -1;
         }
diff --git a/test_conformance/images/clReadWriteImage/test_read_1D_array.cpp b/test_conformance/images/clReadWriteImage/test_read_1D_array.cpp
index ad0444d7ad..5d5c288306 100644
--- a/test_conformance/images/clReadWriteImage/test_read_1D_array.cpp
+++ b/test_conformance/images/clReadWriteImage/test_read_1D_array.cpp
@@ -15,7 +15,9 @@
 //
 #include "../testBase.h"
 
-int test_read_image_1D_array( cl_context context, cl_command_queue queue, image_descriptor *imageInfo, MTdata d )
+int test_read_image_1D_array(cl_context context, cl_command_queue queue,
+                             image_descriptor *imageInfo, MTdata d,
+                             cl_mem_flags flags)
 {
     int error;
 
@@ -35,7 +37,9 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, image_
     // Construct testing sources
     if(!gTestMipmaps)
     {
-        image = create_image_1d_array( context, (cl_mem_flags)(CL_MEM_READ_ONLY), imageInfo->format, imageInfo->width, imageInfo->arraySize, 0, 0, NULL, &error );
+        image = create_image_1d_array(context, flags, imageInfo->format,
+                                      imageInfo->width, imageInfo->arraySize, 0,
+                                      0, NULL, &error);
         if( image == NULL )
         {
             log_error( "ERROR: Unable to create 1D image array of size %d x %d (%s)", (int)imageInfo->width, (int)imageInfo->arraySize, IGetErrorString( error ) );
@@ -50,7 +54,8 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, image_
         image_desc.image_array_size = imageInfo->arraySize;
         image_desc.num_mip_levels = imageInfo->num_mip_levels;
 
-        image = clCreateImage( context, CL_MEM_READ_ONLY, imageInfo->format, &image_desc, NULL, &error);
+        image = clCreateImage(context, flags, imageInfo->format, &image_desc,
+                              NULL, &error);
         if( error != CL_SUCCESS )
         {
             log_error( "ERROR: Unable to create %d level mipmapped 1D image of width %d and array size %d (pitch %d ) (%s)",(int)imageInfo->num_mip_levels, (int)imageInfo->width, (int)imageInfo->arraySize, (int)imageInfo->rowPitch, IGetErrorString( error ) );
@@ -164,7 +169,9 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, image_
     return 0;
 }
 
-int test_read_image_set_1D_array( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format )
+int test_read_image_set_1D_array(cl_device_id device, cl_context context,
+                                 cl_command_queue queue,
+                                 cl_image_format *format, cl_mem_flags flags)
 {
     size_t maxWidth, maxArraySize;
     cl_ulong maxAllocSize, memSize;
@@ -201,7 +208,8 @@ int test_read_image_set_1D_array( cl_device_id device, cl_context context, cl_co
                 if( gDebugTrace )
                     log_info( "   at size %d,%d\n", (int)imageInfo.width, (int)imageInfo.arraySize );
 
-                int ret = test_read_image_1D_array( context, queue, &imageInfo, seed );
+                int ret = test_read_image_1D_array(context, queue, &imageInfo,
+                                                   seed, flags);
                 if( ret )
                     return -1;
             }
@@ -228,7 +236,8 @@ int test_read_image_set_1D_array( cl_device_id device, cl_context context, cl_co
             log_info("Testing %d x %d\n", (int)imageInfo.width, (int)imageInfo.arraySize);
             if( gDebugTrace )
                 log_info( "   at max size %d,%d\n", (int)maxWidth, (int)maxArraySize );
-            if( test_read_image_1D_array( context, queue, &imageInfo, seed ) )
+            if (test_read_image_1D_array(context, queue, &imageInfo, seed,
+                                         flags))
                 return -1;
         }
     }
@@ -266,7 +275,8 @@ int test_read_image_set_1D_array( cl_device_id device, cl_context context, cl_co
 
             if( gDebugTrace )
                 log_info( "   at size %d,%d (row pitch %d) out of %d,%d\n", (int)imageInfo.width, (int)imageInfo.arraySize, (int)imageInfo.rowPitch, (int)maxWidth, (int)maxArraySize );
-            int ret = test_read_image_1D_array( context, queue, &imageInfo, seed );
+            int ret = test_read_image_1D_array(context, queue, &imageInfo, seed,
+                                               flags);
             if( ret )
                 return -1;
         }
diff --git a/test_conformance/images/clReadWriteImage/test_read_2D.cpp b/test_conformance/images/clReadWriteImage/test_read_2D.cpp
index 7c2050350e..fb2e794853 100644
--- a/test_conformance/images/clReadWriteImage/test_read_2D.cpp
+++ b/test_conformance/images/clReadWriteImage/test_read_2D.cpp
@@ -15,7 +15,9 @@
 //
 #include "../testBase.h"
 
-int test_read_image_2D( cl_context context, cl_command_queue queue, image_descriptor *imageInfo, MTdata d )
+int test_read_image_2D(cl_context context, cl_command_queue queue,
+                       image_descriptor *imageInfo, MTdata d,
+                       cl_mem_flags flags)
 {
     int error;
 
@@ -35,7 +37,9 @@ int test_read_image_2D( cl_context context, cl_command_queue queue, image_descri
     // Construct testing sources
     if(!gTestMipmaps)
     {
-        image = create_image_2d( context, (cl_mem_flags)(CL_MEM_READ_ONLY), imageInfo->format, imageInfo->width, imageInfo->height, 0, NULL, &error );
+        image =
+            create_image_2d(context, flags, imageInfo->format, imageInfo->width,
+                            imageInfo->height, 0, NULL, &error);
         if( image == NULL )
         {
             log_error( "ERROR: Unable to create 2D image of size %d x %d (%s)", (int)imageInfo->width, (int)imageInfo->height, IGetErrorString( error ) );
@@ -50,7 +54,8 @@ int test_read_image_2D( cl_context context, cl_command_queue queue, image_descri
         image_desc.image_height = imageInfo->height;
         image_desc.num_mip_levels = imageInfo->num_mip_levels;
 
-        image = clCreateImage( context, CL_MEM_READ_ONLY, imageInfo->format, &image_desc, NULL, &error);
+        image = clCreateImage(context, flags, imageInfo->format, &image_desc,
+                              NULL, &error);
         if( error != CL_SUCCESS )
         {
             log_error( "ERROR: Unable to create %d level mipmapped 2D image of size %d x %d (pitch %d ) (%s)",(int)imageInfo->num_mip_levels, (int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->rowPitch, IGetErrorString( error ) );
@@ -167,7 +172,9 @@ int test_read_image_2D( cl_context context, cl_command_queue queue, image_descri
     return 0;
 }
 
-int test_read_image_set_2D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format )
+int test_read_image_set_2D(cl_device_id device, cl_context context,
+                           cl_command_queue queue, cl_image_format *format,
+                           cl_mem_flags flags)
 {
     size_t maxWidth, maxHeight;
     cl_ulong maxAllocSize, memSize;
@@ -203,7 +210,8 @@ int test_read_image_set_2D( cl_device_id device, cl_context context, cl_command_
                 if( gDebugTrace )
                     log_info( "   at size %d,%d\n", (int)imageInfo.width, (int)imageInfo.height );
 
-                int ret = test_read_image_2D( context, queue, &imageInfo, seed );
+                int ret =
+                    test_read_image_2D(context, queue, &imageInfo, seed, flags);
                 if( ret )
                     return -1;
             }
@@ -229,7 +237,7 @@ int test_read_image_set_2D( cl_device_id device, cl_context context, cl_command_
             log_info("Testing %d x %d\n", (int)imageInfo.width, (int)imageInfo.height);
             if( gDebugTrace )
                 log_info( "   at max size %d,%d\n", (int)maxWidth, (int)maxHeight );
-            if( test_read_image_2D( context, queue, &imageInfo, seed ) )
+            if (test_read_image_2D(context, queue, &imageInfo, seed, flags))
                 return -1;
         }
     }
@@ -265,7 +273,8 @@ int test_read_image_set_2D( cl_device_id device, cl_context context, cl_command_
 
             if( gDebugTrace )
                 log_info( "   at size %d,%d (row pitch %d) out of %d,%d\n", (int)imageInfo.width, (int)imageInfo.height, (int)imageInfo.rowPitch, (int)maxWidth, (int)maxHeight );
-            int ret = test_read_image_2D( context, queue, &imageInfo, seed );
+            int ret =
+                test_read_image_2D(context, queue, &imageInfo, seed, flags);
             if( ret )
                 return -1;
         }
diff --git a/test_conformance/images/clReadWriteImage/test_read_2D_array.cpp b/test_conformance/images/clReadWriteImage/test_read_2D_array.cpp
index 6118e69764..d0113bb749 100644
--- a/test_conformance/images/clReadWriteImage/test_read_2D_array.cpp
+++ b/test_conformance/images/clReadWriteImage/test_read_2D_array.cpp
@@ -15,7 +15,9 @@
 //
 #include "../testBase.h"
 
-int test_read_image_2D_array( cl_context context, cl_command_queue queue, image_descriptor *imageInfo, MTdata d )
+int test_read_image_2D_array(cl_context context, cl_command_queue queue,
+                             image_descriptor *imageInfo, MTdata d,
+                             cl_mem_flags flags)
 {
     int error;
 
@@ -35,7 +37,9 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, image_
     // Construct testing sources
     if(!gTestMipmaps)
     {
-        image = create_image_2d_array( context, (cl_mem_flags)(CL_MEM_READ_ONLY), imageInfo->format, imageInfo->width, imageInfo->height, imageInfo->arraySize, 0, 0, NULL, &error );
+        image = create_image_2d_array(context, flags, imageInfo->format,
+                                      imageInfo->width, imageInfo->height,
+                                      imageInfo->arraySize, 0, 0, NULL, &error);
         if( image == NULL )
         {
             log_error( "ERROR: Unable to create 2D image array of size %d x %d x %d (%s)", (int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->arraySize, IGetErrorString( error ) );
@@ -51,7 +55,8 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, image_
         image_desc.image_array_size = imageInfo->arraySize;
         image_desc.num_mip_levels = imageInfo->num_mip_levels;
 
-        image = clCreateImage( context, CL_MEM_READ_ONLY, imageInfo->format, &image_desc, NULL, &error);
+        image = clCreateImage(context, flags, imageInfo->format, &image_desc,
+                              NULL, &error);
         if( error != CL_SUCCESS )
         {
             log_error( "ERROR: Unable to create %d level mipmapped 3D image of size %d x %d x %d (pitch %d, %d ) (%s)",(int)imageInfo->num_mip_levels, (int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->depth, (int)imageInfo->rowPitch, (int)imageInfo->slicePitch, IGetErrorString( error ) );
@@ -142,7 +147,9 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, image_
     return 0;
 }
 
-int test_read_image_set_2D_array( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format )
+int test_read_image_set_2D_array(cl_device_id device, cl_context context,
+                                 cl_command_queue queue,
+                                 cl_image_format *format, cl_mem_flags flags)
 {
     size_t maxWidth, maxHeight, maxArraySize;
     cl_ulong maxAllocSize, memSize;
@@ -181,7 +188,8 @@ int test_read_image_set_2D_array( cl_device_id device, cl_context context, cl_co
 
                     if( gDebugTrace )
                         log_info( "   at size %d,%d,%d\n", (int)imageInfo.width, (int)imageInfo.height, (int)imageInfo.arraySize );
-                    int ret = test_read_image_2D_array( context, queue, &imageInfo, seed );
+                    int ret = test_read_image_2D_array(context, queue,
+                                                       &imageInfo, seed, flags);
                     if( ret )
                         return -1;
                 }
@@ -209,7 +217,8 @@ int test_read_image_set_2D_array( cl_device_id device, cl_context context, cl_co
                 imageInfo.num_mip_levels = (cl_uint) random_log_in_range(2, (int)compute_max_mip_levels(imageInfo.width, imageInfo.height, 0), seed);
 
             log_info("Testing %d x %d x %d\n", (int)imageInfo.width, (int)imageInfo.height, (int)imageInfo.arraySize);
-            if( test_read_image_2D_array( context, queue, &imageInfo, seed ) )
+            if (test_read_image_2D_array(context, queue, &imageInfo, seed,
+                                         flags))
                 return -1;
         }
     }
@@ -253,7 +262,8 @@ int test_read_image_set_2D_array( cl_device_id device, cl_context context, cl_co
 
             if( gDebugTrace )
                 log_info( "   at size %d,%d,%d (pitch %d,%d) out of %d,%d,%d\n", (int)imageInfo.width, (int)imageInfo.height, (int)imageInfo.arraySize, (int)imageInfo.rowPitch, (int)imageInfo.slicePitch, (int)maxWidth, (int)maxHeight, (int)maxArraySize );
-            int ret = test_read_image_2D_array( context, queue, &imageInfo, seed );
+            int ret = test_read_image_2D_array(context, queue, &imageInfo, seed,
+                                               flags);
             if( ret )
                 return -1;
         }
diff --git a/test_conformance/images/clReadWriteImage/test_read_3D.cpp b/test_conformance/images/clReadWriteImage/test_read_3D.cpp
index 8f21ae94e1..2dcd2433d6 100644
--- a/test_conformance/images/clReadWriteImage/test_read_3D.cpp
+++ b/test_conformance/images/clReadWriteImage/test_read_3D.cpp
@@ -15,7 +15,9 @@
 //
 #include "../testBase.h"
 
-int test_read_image_3D( cl_context context, cl_command_queue queue, image_descriptor *imageInfo, MTdata d )
+int test_read_image_3D(cl_context context, cl_command_queue queue,
+                       image_descriptor *imageInfo, MTdata d,
+                       cl_mem_flags flags)
 {
     int error;
 
@@ -34,7 +36,9 @@ int test_read_image_3D( cl_context context, cl_command_queue queue, image_descri
     // Construct testing sources
     if(!gTestMipmaps)
     {
-        image = create_image_3d( context, (cl_mem_flags)(CL_MEM_READ_ONLY), imageInfo->format, imageInfo->width, imageInfo->height, imageInfo->depth, 0, 0, NULL, &error );
+        image = create_image_3d(context, flags, imageInfo->format,
+                                imageInfo->width, imageInfo->height,
+                                imageInfo->depth, 0, 0, NULL, &error);
         if( image == NULL )
         {
             log_error( "ERROR: Unable to create 2D image of size %d x %d x %d (%s)", (int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->depth, IGetErrorString( error ) );
@@ -50,7 +54,8 @@ int test_read_image_3D( cl_context context, cl_command_queue queue, image_descri
         image_desc.image_depth = imageInfo->depth;
         image_desc.num_mip_levels = imageInfo->num_mip_levels;
 
-        image = clCreateImage( context, CL_MEM_READ_ONLY, imageInfo->format, &image_desc, NULL, &error);
+        image = clCreateImage(context, flags, imageInfo->format, &image_desc,
+                              NULL, &error);
         if( error != CL_SUCCESS )
         {
             log_error( "ERROR: Unable to create %d level mipmapped 3D image of size %d x %d x %d (pitch %d, %d ) (%s)",(int)imageInfo->num_mip_levels, (int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->depth, (int)imageInfo->rowPitch, (int)imageInfo->slicePitch, IGetErrorString( error ) );
@@ -147,7 +152,9 @@ int test_read_image_3D( cl_context context, cl_command_queue queue, image_descri
     return 0;
 }
 
-int test_read_image_set_3D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format )
+int test_read_image_set_3D(cl_device_id device, cl_context context,
+                           cl_command_queue queue, cl_image_format *format,
+                           cl_mem_flags flags)
 {
     size_t maxWidth, maxHeight, maxDepth;
     cl_ulong maxAllocSize, memSize;
@@ -186,7 +193,8 @@ int test_read_image_set_3D( cl_device_id device, cl_context context, cl_command_
 
                     if( gDebugTrace )
                         log_info( "   at size %d,%d,%d\n", (int)imageInfo.width, (int)imageInfo.height, (int)imageInfo.depth );
-                    int ret = test_read_image_3D( context, queue, &imageInfo, seed );
+                    int ret = test_read_image_3D(context, queue, &imageInfo,
+                                                 seed, flags);
                     if( ret )
                         return -1;
                 }
@@ -214,8 +222,8 @@ int test_read_image_set_3D( cl_device_id device, cl_context context, cl_command_
         imageInfo.num_mip_levels = (cl_uint) random_log_in_range(2, (int)compute_max_mip_levels(imageInfo.width, imageInfo.height, imageInfo.depth), seed);
 
       log_info("Testing %d x %d x %d\n", (int)imageInfo.width, (int)imageInfo.height, (int)imageInfo.depth);
-      if( test_read_image_3D( context, queue, &imageInfo, seed ) )
-        return -1;
+      if (test_read_image_3D(context, queue, &imageInfo, seed, flags))
+          return -1;
     }
   }
     else
@@ -257,7 +265,8 @@ int test_read_image_set_3D( cl_device_id device, cl_context context, cl_command_
 
             if( gDebugTrace )
                 log_info( "   at size %d,%d,%d (pitch %d,%d) out of %d,%d,%d\n", (int)imageInfo.width, (int)imageInfo.height, (int)imageInfo.depth, (int)imageInfo.rowPitch, (int)imageInfo.slicePitch, (int)maxWidth, (int)maxHeight, (int)maxDepth );
-            int ret = test_read_image_3D( context, queue, &imageInfo, seed );
+            int ret =
+                test_read_image_3D(context, queue, &imageInfo, seed, flags);
             if( ret )
                 return -1;
         }

From f02cbad2e31e20789526ab214809eb01b0f5d84e Mon Sep 17 00:00:00 2001
From: James Price <jrprice@google.com>
Date: Sun, 10 Jan 2021 07:55:25 -0500
Subject: [PATCH 012/158] Fix scope of clEventWrapper declarations (#1099)

This change allows us to remove the extra clReleaseEvent calls which
were causing double-free issues.
---
 test_conformance/buffers/test_buffer_fill.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/test_conformance/buffers/test_buffer_fill.cpp b/test_conformance/buffers/test_buffer_fill.cpp
index 2a12bd8c35..8e0b93fa97 100644
--- a/test_conformance/buffers/test_buffer_fill.cpp
+++ b/test_conformance/buffers/test_buffer_fill.cpp
@@ -566,7 +566,6 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
     void        *outptr[5];
     clProgramWrapper program[5];
     clKernelWrapper kernel[5];
-    clEventWrapper event[2];
     size_t      ptrSizes[5];
     size_t      global_work_size[3];
     int         err;
@@ -599,6 +598,7 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
 
         for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
         {
+            clEventWrapper event[2];
 
             if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
                 buffers[ii] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, hostptr[i], &err);
@@ -699,7 +699,6 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
     TestStruct pattern;
     clProgramWrapper program;
     clKernelWrapper kernel;
-    clEventWrapper event[2];
     size_t      ptrSize = sizeof( TestStruct );
     size_t      global_work_size[3];
     int         n, err;
@@ -731,6 +730,8 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
         // Test with random offsets and fill sizes
         for (n = 0; n < 8; n++)
         {
+            clEventWrapper event[2];
+
             offset_elements =
                 (size_t)get_random_float(0.f, (float)(num_elements - 8), d);
             fill_elements = (size_t)get_random_float(
@@ -832,7 +833,6 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
                 free_mtdata(d);
                 return -1;
             }
-            clReleaseEvent( event[0] );
 
             err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL );
             if ( err != CL_SUCCESS ){
@@ -858,7 +858,6 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
             if ( err != CL_SUCCESS ){
                 print_error( err, "clWaitForEvents() failed" );
             }
-            clReleaseEvent( event[1] );
 
             if ( verify_fill_struct( inptr, outptr, num_elements) ) {
                 log_error( " buffer_FILL async struct test failed\n" );

From 2597027737d8c74cbf2a979a3fd585f9f2811929 Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Mon, 11 Jan 2021 14:54:19 +0100
Subject: [PATCH 013/158] Use highest OpenCL C version (#1081)

* Set the highest supported OpenCL C version.

* Remove gDeviceLt20 variable - not used anymore.

* Fix formatting issues
---
 test_conformance/SVM/main.cpp                 | 24 ++++++++++---------
 .../api/test_sub_group_dispatch.cpp           |  6 +++--
 .../device_execution/enqueue_profiling.cpp    |  6 ++---
 .../device_execution/host_multi_queue.cpp     |  8 +++++--
 .../device_execution/host_queue_order.cpp     | 10 +++++---
 test_conformance/device_execution/utils.cpp   |  5 ++--
 .../generic_address_space/basic_tests.cpp     |  6 +++--
 .../generic_address_space/stress_tests.cpp    |  6 +++--
 .../images/kernel_image_methods/main.cpp      |  1 -
 .../images/kernel_image_methods/test_1D.cpp   |  4 ++--
 .../kernel_image_methods/test_1D_array.cpp    |  4 ++--
 .../images/kernel_image_methods/test_2D.cpp   |  4 ++--
 .../kernel_image_methods/test_2D_array.cpp    |  4 ++--
 .../kernel_image_methods/test_loops.cpp       |  4 ----
 .../images/kernel_read_write/main.cpp         |  5 ----
 .../kernel_read_write/test_iterations.cpp     |  4 ++--
 .../images/kernel_read_write/test_read_1D.cpp |  4 ++--
 .../kernel_read_write/test_read_1D_array.cpp  |  4 ++--
 .../kernel_read_write/test_read_2D_array.cpp  |  4 ++--
 .../images/kernel_read_write/test_read_3D.cpp |  4 ++--
 .../kernel_read_write/test_write_1D.cpp       |  4 ++--
 .../kernel_read_write/test_write_1D_array.cpp |  4 ++--
 .../kernel_read_write/test_write_2D_array.cpp |  4 ++--
 .../kernel_read_write/test_write_3D.cpp       |  4 ++--
 .../kernel_read_write/test_write_image.cpp    |  4 ++--
 .../images/samplerlessReads/main.cpp          |  1 -
 .../samplerlessReads/test_iterations.cpp      |  4 ++--
 .../images/samplerlessReads/test_loops.cpp    |  5 ----
 .../images/samplerlessReads/test_read_1D.cpp  |  4 ++--
 .../samplerlessReads/test_read_1D_array.cpp   |  4 ++--
 .../samplerlessReads/test_read_1D_buffer.cpp  |  4 ++--
 .../samplerlessReads/test_read_2D_array.cpp   |  4 ++--
 .../images/samplerlessReads/test_read_3D.cpp  |  4 ++--
 test_conformance/pipes/test_pipe_limits.cpp   | 17 ++++++-------
 .../pipes/test_pipe_query_functions.cpp       |  6 +++--
 .../pipes/test_pipe_read_write.cpp            | 10 ++++----
 .../pipes/test_pipe_readwrite_errors.cpp      |  6 +++--
 .../pipes/test_pipe_subgroups.cpp             |  6 +++--
 test_conformance/subgroups/subhelpers.h       |  4 ++--
 test_conformance/subgroups/test_queries.cpp   |  5 ++--
 test_conformance/subgroups/test_workitem.cpp  |  5 ++--
 test_conformance/workgroups/test_wg_all.cpp   |  5 ++--
 test_conformance/workgroups/test_wg_any.cpp   |  5 ++--
 .../workgroups/test_wg_broadcast.cpp          | 14 +++++++----
 .../workgroups/test_wg_reduce.cpp             | 18 ++++++++++----
 .../workgroups/test_wg_reduce_max.cpp         | 18 ++++++++++----
 .../workgroups/test_wg_reduce_min.cpp         | 18 ++++++++++----
 .../workgroups/test_wg_scan_exclusive_add.cpp | 18 ++++++++++----
 .../workgroups/test_wg_scan_exclusive_max.cpp | 18 ++++++++++----
 .../workgroups/test_wg_scan_exclusive_min.cpp | 18 ++++++++++----
 .../workgroups/test_wg_scan_inclusive_add.cpp | 18 ++++++++++----
 .../workgroups/test_wg_scan_inclusive_max.cpp | 18 ++++++++++----
 .../workgroups/test_wg_scan_inclusive_min.cpp | 18 ++++++++++----
 53 files changed, 247 insertions(+), 165 deletions(-)

diff --git a/test_conformance/SVM/main.cpp b/test_conformance/SVM/main.cpp
index 0a052f0503..56fb24f1a1 100644
--- a/test_conformance/SVM/main.cpp
+++ b/test_conformance/SVM/main.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -213,14 +213,15 @@ cl_int create_cl_objects(cl_device_id device_from_harness, const char** ppCodeSt
       return -1;
     }
     bool extensions_supported = true;
-    for (auto extension : extensions_list) 
+    for (auto extension : extensions_list)
     {
-      if (!is_extension_available(devices[i], extension.c_str())) 
-      {
-        log_error("Required extension not found - device id %d - %s\n", i, extension.c_str());
-        extensions_supported = false;
-        break;
-      }
+        if (!is_extension_available(devices[i], extension.c_str()))
+        {
+            log_error("Required extension not found - device id %d - %s\n", i,
+                      extension.c_str());
+            extensions_supported = false;
+            break;
+        }
     }
     if((caps & required_svm_caps) == required_svm_caps && extensions_supported)
     {
@@ -249,10 +250,11 @@ cl_int create_cl_objects(cl_device_id device_from_harness, const char** ppCodeSt
     test_error(error, "clCreateCommandQueue failed");
   }
 
-  if(ppCodeString)
+  if (ppCodeString)
   {
-    error = create_single_kernel_helper(*context, program, 0, 1, ppCodeString, 0, "-cl-std=CL2.0");
-    test_error( error, "failed to create program" );
+      error =
+          create_single_kernel_helper(*context, program, 0, 1, ppCodeString, 0);
+      test_error(error, "failed to create program");
   }
 
   return 0;
diff --git a/test_conformance/api/test_sub_group_dispatch.cpp b/test_conformance/api/test_sub_group_dispatch.cpp
index 387d6c3906..01d0ffa380 100644
--- a/test_conformance/api/test_sub_group_dispatch.cpp
+++ b/test_conformance/api/test_sub_group_dispatch.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -95,7 +95,9 @@ int test_sub_group_dispatch(cl_device_id deviceID, cl_context context, cl_comman
         }
     }
 
-    error = create_single_kernel_helper_with_build_options(context, &program, &kernel, 1, subgroup_dispatch_kernel, "subgroup_dispatch_kernel", "-cl-std=CL2.0");
+    error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                        subgroup_dispatch_kernel,
+                                        "subgroup_dispatch_kernel");
     if (error != 0)
         return error;
 
diff --git a/test_conformance/device_execution/enqueue_profiling.cpp b/test_conformance/device_execution/enqueue_profiling.cpp
index 8e5bab7642..b9e1a179fc 100644
--- a/test_conformance/device_execution/enqueue_profiling.cpp
+++ b/test_conformance/device_execution/enqueue_profiling.cpp
@@ -89,9 +89,9 @@ int test_enqueue_profiling(cl_device_id device, cl_context context,
 
     cl_event kernel_event;
 
-    err_ret = create_single_kernel_helper_with_build_options(
-        context, &program, &kernel, 1, &enqueue_multi_level,
-        "enqueue_multi_level", "-cl-std=CL2.0");
+    err_ret = create_single_kernel_helper(context, &program, &kernel, 1,
+                                          &enqueue_multi_level,
+                                          "enqueue_multi_level");
     if (check_error(err_ret, "Create single kernel failed")) return -1;
 
     res_mem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
diff --git a/test_conformance/device_execution/host_multi_queue.cpp b/test_conformance/device_execution/host_multi_queue.cpp
index e9a675c3f3..661d33deb9 100644
--- a/test_conformance/device_execution/host_multi_queue.cpp
+++ b/test_conformance/device_execution/host_multi_queue.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -184,7 +184,11 @@ int test_host_multi_queue(cl_device_id device, cl_context context, cl_command_qu
                 global = 16;
             }
 
-            err_ret |= create_single_kernel_helper_with_build_options(context, &program[i], &kernel[i], sources_multi_queue_block[i].num_lines, sources_multi_queue_block[i].lines, sources_multi_queue_block[i].kernel_name, "-cl-std=CL2.0");
+            err_ret |= create_single_kernel_helper(
+                context, &program[i], &kernel[i],
+                sources_multi_queue_block[i].num_lines,
+                sources_multi_queue_block[i].lines,
+                sources_multi_queue_block[i].kernel_name);
             if(check_error(err_ret, "Create single kernel failed")) { res = -1; break; }
 
             mem[i] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(kernel_results), kernel_results, &err_ret);
diff --git a/test_conformance/device_execution/host_queue_order.cpp b/test_conformance/device_execution/host_queue_order.cpp
index 5dce16042a..2b5688d126 100644
--- a/test_conformance/device_execution/host_queue_order.cpp
+++ b/test_conformance/device_execution/host_queue_order.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -133,10 +133,14 @@ int test_host_queue_order(cl_device_id device, cl_context context, cl_command_qu
 
     cl_event kernel_event;
 
-    err_ret = create_single_kernel_helper_with_build_options(context, &program1, &kernel1,  arr_size(enqueue_block_first_kernel), enqueue_block_first_kernel, "enqueue_block_first_kernel", "-cl-std=CL2.0");
+    err_ret = create_single_kernel_helper(
+        context, &program1, &kernel1, arr_size(enqueue_block_first_kernel),
+        enqueue_block_first_kernel, "enqueue_block_first_kernel");
     if(check_error(err_ret, "Create single kernel failed")) return -1;
 
-    err_ret = create_single_kernel_helper_with_build_options(context, &program2, &kernel2, arr_size(enqueue_block_second_kernel), enqueue_block_second_kernel, "enqueue_block_second_kernel", "-cl-std=CL2.0");
+    err_ret = create_single_kernel_helper(
+        context, &program2, &kernel2, arr_size(enqueue_block_second_kernel),
+        enqueue_block_second_kernel, "enqueue_block_second_kernel");
     if(check_error(err_ret, "Create single kernel failed")) return -1;
 
     res_mem = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, sizeof(kernel_results), kernel_results, &err_ret);
diff --git a/test_conformance/device_execution/utils.cpp b/test_conformance/device_execution/utils.cpp
index 66a2211f6b..05b6949172 100644
--- a/test_conformance/device_execution/utils.cpp
+++ b/test_conformance/device_execution/utils.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -40,7 +40,8 @@ int run_n_kernel_args(cl_context context, cl_command_queue queue, const char** s
     cl_uint i;
     size_t ret_len;
 
-    err_ret = create_single_kernel_helper_with_build_options(context, &program, &kernel, num_lines, source, kernel_name, "-cl-std=CL2.0");
+    err_ret = create_single_kernel_helper(context, &program, &kernel, num_lines,
+                                          source, kernel_name);
     if(check_error(err_ret, "Create single kernel failed")) return -1;
 
     mem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, res_size, results, &err_ret);
diff --git a/test_conformance/generic_address_space/basic_tests.cpp b/test_conformance/generic_address_space/basic_tests.cpp
index 0b81564dab..b2e745c0fe 100644
--- a/test_conformance/generic_address_space/basic_tests.cpp
+++ b/test_conformance/generic_address_space/basic_tests.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -39,7 +39,9 @@ class CBasicTest : CTest {
 
         const char *srcPtr = src.c_str();
 
-        if (create_single_kernel_helper_with_build_options(context, &program, &kernel, 1, &srcPtr, "testKernel", "-cl-std=CL2.0")) {
+        if (create_single_kernel_helper(context, &program, &kernel, 1, &srcPtr,
+                                        "testKernel"))
+        {
             log_error("create_single_kernel_helper failed");
             return -1;
         }
diff --git a/test_conformance/generic_address_space/stress_tests.cpp b/test_conformance/generic_address_space/stress_tests.cpp
index 4f94a5d098..7193e69236 100644
--- a/test_conformance/generic_address_space/stress_tests.cpp
+++ b/test_conformance/generic_address_space/stress_tests.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -41,7 +41,9 @@ class CStressTest : public CTest {
 
         const char *srcPtr = src.c_str();
 
-        if (create_single_kernel_helper_with_build_options(context, &program, &kernel, 1, &srcPtr, "testKernel", "-cl-std=CL2.0")) {
+        if (create_single_kernel_helper(context, &program, &kernel, 1, &srcPtr,
+                                        "testKernel"))
+        {
             log_error("create_single_kernel_helper failed");
             return -1;
         }
diff --git a/test_conformance/images/kernel_image_methods/main.cpp b/test_conformance/images/kernel_image_methods/main.cpp
index e1320ce398..50653ef55f 100644
--- a/test_conformance/images/kernel_image_methods/main.cpp
+++ b/test_conformance/images/kernel_image_methods/main.cpp
@@ -23,7 +23,6 @@
 bool gDebugTrace;
 bool gTestSmallImages;
 bool gTestMaxImages;
-bool gDeviceLt20 = false;
 
 cl_channel_type gChannelTypeToUse = (cl_channel_type)-1;
 cl_channel_order gChannelOrderToUse = (cl_channel_order)-1;
diff --git a/test_conformance/images/kernel_image_methods/test_1D.cpp b/test_conformance/images/kernel_image_methods/test_1D.cpp
index 1ea8eb8895..0059d4c203 100644
--- a/test_conformance/images/kernel_image_methods/test_1D.cpp
+++ b/test_conformance/images/kernel_image_methods/test_1D.cpp
@@ -15,7 +15,6 @@
 //
 #include "../testBase.h"
 
-extern bool gDeviceLt20;
 
 struct image_kernel_data
 {
@@ -98,7 +97,8 @@ static int test_get_1Dimage_info_single(cl_context context,
     if (error)
         print_error(error, "clFinish failed.\n");
     const char *ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0" );
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create kernel to test against" );
 
     // Create an output buffer
diff --git a/test_conformance/images/kernel_image_methods/test_1D_array.cpp b/test_conformance/images/kernel_image_methods/test_1D_array.cpp
index 18c190bb18..797161c427 100644
--- a/test_conformance/images/kernel_image_methods/test_1D_array.cpp
+++ b/test_conformance/images/kernel_image_methods/test_1D_array.cpp
@@ -15,7 +15,6 @@
 //
 #include "../testBase.h"
 
-extern bool gDeviceLt20;
 
 struct image_kernel_data
 {
@@ -102,7 +101,8 @@ int test_get_1Dimage_array_info_single(cl_context context,
     if (error)
         print_error(error, "clFinish failed.\n");
     const char *ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0");
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create kernel to test against" );
 
     // Create an output buffer
diff --git a/test_conformance/images/kernel_image_methods/test_2D.cpp b/test_conformance/images/kernel_image_methods/test_2D.cpp
index 2ebc546041..b0d4a7086d 100644
--- a/test_conformance/images/kernel_image_methods/test_2D.cpp
+++ b/test_conformance/images/kernel_image_methods/test_2D.cpp
@@ -15,7 +15,6 @@
 //
 #include "../testBase.h"
 
-extern bool gDeviceLt20;
 
 struct image_kernel_data
 {
@@ -133,7 +132,8 @@ int test_get_image_info_single(cl_context context, cl_command_queue queue,
     if (error)
         print_error(error, "clFinish failed.\n");
     const char *ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0" );
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create kernel to test against" );
 
     // Create an output buffer
diff --git a/test_conformance/images/kernel_image_methods/test_2D_array.cpp b/test_conformance/images/kernel_image_methods/test_2D_array.cpp
index 98c1106249..79248dd530 100644
--- a/test_conformance/images/kernel_image_methods/test_2D_array.cpp
+++ b/test_conformance/images/kernel_image_methods/test_2D_array.cpp
@@ -15,7 +15,6 @@
 //
 #include "../testBase.h"
 
-extern bool gDeviceLt20;
 
 struct image_kernel_data
 {
@@ -108,7 +107,8 @@ int test_get_2Dimage_array_info_single(cl_context context,
     if (error)
         print_error(error, "clFinish failed.\n");
     const char *ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0" );
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create kernel to test against" );
 
     // Create an output buffer
diff --git a/test_conformance/images/kernel_image_methods/test_loops.cpp b/test_conformance/images/kernel_image_methods/test_loops.cpp
index 8dfebd2fcb..4c7b93e8b9 100644
--- a/test_conformance/images/kernel_image_methods/test_loops.cpp
+++ b/test_conformance/images/kernel_image_methods/test_loops.cpp
@@ -16,7 +16,6 @@
 #include "../testBase.h"
 #include "../common.h"
 
-extern bool gDeviceLt20;
 
 extern int test_get_image_info_1D(cl_device_id device, cl_context context,
                                   cl_command_queue queue,
@@ -117,9 +116,6 @@ int test_image_set( cl_device_id device, cl_context context, cl_command_queue qu
 {
     int version_check;
     auto version = get_device_cl_version(device);
-    if (version < Version(2, 0)) {
-        gDeviceLt20 = true;
-    }
 
     if ((version_check = (version < Version(1, 2))))
     {
diff --git a/test_conformance/images/kernel_read_write/main.cpp b/test_conformance/images/kernel_read_write/main.cpp
index f430c7f57c..31dceb33a7 100644
--- a/test_conformance/images/kernel_read_write/main.cpp
+++ b/test_conformance/images/kernel_read_write/main.cpp
@@ -35,7 +35,6 @@ bool gTestSmallImages;
 bool gTestMaxImages;
 bool gTestImage2DFromBuffer;
 bool gTestMipmaps;
-bool gDeviceLt20 = false;
 cl_filter_mode    gFilterModeToUse = (cl_filter_mode)-1;
 // Default is CL_MEM_USE_HOST_PTR for the test
 cl_mem_flags    gMemFlagsToUse = CL_MEM_USE_HOST_PTR;
@@ -107,10 +106,6 @@ static int doTest( cl_device_id device, cl_context context, cl_command_queue que
     bool            tDisableOffsets = false;
     bool            tNormalizedModeToUse = false;
     cl_filter_mode  tFilterModeToUse = (cl_filter_mode)-1;
-    auto version = get_device_cl_version(device);
-    if (version < Version(2, 0)) {
-        gDeviceLt20 = true;
-    }
 
     if( testTypesToRun & kReadTests )
     {
diff --git a/test_conformance/images/kernel_read_write/test_iterations.cpp b/test_conformance/images/kernel_read_write/test_iterations.cpp
index 06c6c9cfe3..08a4fd28cb 100644
--- a/test_conformance/images/kernel_read_write/test_iterations.cpp
+++ b/test_conformance/images/kernel_read_write/test_iterations.cpp
@@ -26,7 +26,6 @@ extern bool gTestImage2DFromBuffer;
 extern uint64_t gRoundingStartValue;
 extern cl_mem_flags gMemFlagsToUse;
 extern int gtestTypesToRun;
-extern bool gDeviceLt20;
 
 // Utility function to clamp down image sizes for certain tests to avoid
 // using too much memory.
@@ -1664,7 +1663,8 @@ int test_read_image_set_2D( cl_device_id device, cl_context context, cl_command_
             gTestMipmaps?", lod":" ");
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0");
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
     if( gTestSmallImages )
diff --git a/test_conformance/images/kernel_read_write/test_read_1D.cpp b/test_conformance/images/kernel_read_write/test_read_1D.cpp
index 3e3b930d9d..ed387532ea 100644
--- a/test_conformance/images/kernel_read_write/test_read_1D.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_1D.cpp
@@ -26,7 +26,6 @@
 extern uint64_t gRoundingStartValue;
 extern cl_mem_flags gMemFlagsToUse;
 extern int gtestTypesToRun;
-extern bool gDeviceLt20;
 
 const char *read1DKernelSourcePattern =
 "__kernel void sample_kernel( read_only image1d_t input,%s __global float *xOffsets, __global %s4 *results %s)\n"
@@ -1056,7 +1055,8 @@ int test_read_image_set_1D( cl_device_id device, cl_context context, cl_command_
 
     ptr = programSrc;
 
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0");
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     if(error)
     {
         exit(1);
diff --git a/test_conformance/images/kernel_read_write/test_read_1D_array.cpp b/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
index 44797b1942..1ffd598015 100644
--- a/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
@@ -25,7 +25,6 @@
 extern uint64_t gRoundingStartValue;
 extern cl_mem_flags gMemFlagsToUse;
 extern int gtestTypesToRun;
-extern bool gDeviceLt20;
 
 const char *read1DArrayKernelSourcePattern =
 "__kernel void sample_kernel( read_only image1d_array_t input,%s __global float *xOffsets, __global float *yOffsets, __global %s4 *results %s)\n"
@@ -1165,7 +1164,8 @@ int test_read_image_set_1D_array( cl_device_id device, cl_context context, cl_co
             gTestMipmaps ? ", lod" : "" );
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0");
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
     if( gTestSmallImages )
diff --git a/test_conformance/images/kernel_read_write/test_read_2D_array.cpp b/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
index d424fbdd2a..558b126693 100644
--- a/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
@@ -18,7 +18,6 @@
 
 extern cl_mem_flags gMemFlagsToUse;
 extern int gtestTypesToRun;
-extern bool gDeviceLt20;
 
 // Utility function to clamp down image sizes for certain tests to avoid
 // using too much memory.
@@ -1392,7 +1391,8 @@ int test_read_image_set_2D_array( cl_device_id device, cl_context context, cl_co
             gTestMipmaps ? ", lod" : " " );
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0");
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
     // Run tests
diff --git a/test_conformance/images/kernel_read_write/test_read_3D.cpp b/test_conformance/images/kernel_read_write/test_read_3D.cpp
index ae8d737de2..0fd777f9fe 100644
--- a/test_conformance/images/kernel_read_write/test_read_3D.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_3D.cpp
@@ -18,7 +18,6 @@
 
 extern cl_mem_flags gMemFlagsToUse;
 extern int gtestTypesToRun;
-extern bool gDeviceLt20;
 
 // Utility function to clamp down image sizes for certain tests to avoid
 // using too much memory.
@@ -1230,7 +1229,8 @@ int test_read_image_set_3D( cl_device_id device, cl_context context, cl_command_
             gTestMipmaps? ",lod":" ");
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0");
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
     // Run tests
diff --git a/test_conformance/images/kernel_read_write/test_write_1D.cpp b/test_conformance/images/kernel_read_write/test_write_1D.cpp
index 68b913e945..bae88b23cc 100644
--- a/test_conformance/images/kernel_read_write/test_write_1D.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_1D.cpp
@@ -21,7 +21,6 @@
 
 extern cl_mem_flags gMemFlagsToUse;
 extern int gtestTypesToRun;
-extern bool gDeviceLt20;
 
 extern bool validate_float_write_results( float *expected, float *actual, image_descriptor *imageInfo );
 extern bool validate_half_write_results( cl_half *expected, cl_half *actual, image_descriptor* imageInfo );
@@ -580,7 +579,8 @@ int test_write_image_1D_set( cl_device_id device, cl_context context, cl_command
              gTestMipmaps ? ", lod" :"" );
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0");
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
     // Run tests
diff --git a/test_conformance/images/kernel_read_write/test_write_1D_array.cpp b/test_conformance/images/kernel_read_write/test_write_1D_array.cpp
index 57bdd546b6..6242797aef 100644
--- a/test_conformance/images/kernel_read_write/test_write_1D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_1D_array.cpp
@@ -21,7 +21,6 @@
 
 extern cl_mem_flags gMemFlagsToUse;
 extern int gtestTypesToRun;
-extern bool gDeviceLt20;
 
 extern bool validate_float_write_results( float *expected, float *actual, image_descriptor *imageInfo );
 extern bool validate_half_write_results( cl_half *expected, cl_half *actual, image_descriptor *imageInfo );
@@ -603,7 +602,8 @@ int test_write_image_1D_array_set( cl_device_id device, cl_context context, cl_c
              gTestMipmaps ? ", lod" :"" );
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0");
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
     // Run tests
diff --git a/test_conformance/images/kernel_read_write/test_write_2D_array.cpp b/test_conformance/images/kernel_read_write/test_write_2D_array.cpp
index 3de467106f..b0dc1cc3e4 100644
--- a/test_conformance/images/kernel_read_write/test_write_2D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_2D_array.cpp
@@ -21,7 +21,6 @@
 
 extern cl_mem_flags gMemFlagsToUse;
 extern int gtestTypesToRun;
-extern bool gDeviceLt20;
 
 extern bool validate_float_write_results( float *expected, float *actual, image_descriptor *imageInfo );
 extern bool validate_half_write_results( cl_half *expected, cl_half *actual, image_descriptor *imageInfo );
@@ -630,7 +629,8 @@ int test_write_image_2D_array_set( cl_device_id device, cl_context context, cl_c
              gTestMipmaps ? ", lod" : "" );
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0");
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
     // Run tests
diff --git a/test_conformance/images/kernel_read_write/test_write_3D.cpp b/test_conformance/images/kernel_read_write/test_write_3D.cpp
index c6223d8a29..7df29ca489 100644
--- a/test_conformance/images/kernel_read_write/test_write_3D.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_3D.cpp
@@ -21,7 +21,6 @@
 
 extern cl_mem_flags gMemFlagsToUse;
 extern int gtestTypesToRun;
-extern bool gDeviceLt20;
 
 extern bool validate_float_write_results( float *expected, float *actual, image_descriptor *imageInfo );
 extern bool validate_half_write_results( cl_half *expected, cl_half *actual, image_descriptor *imageInfo );
@@ -636,7 +635,8 @@ int test_write_image_3D_set( cl_device_id device, cl_context context, cl_command
              gTestMipmaps ? ", lod" : "" );
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0");
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
     // Run tests
diff --git a/test_conformance/images/kernel_read_write/test_write_image.cpp b/test_conformance/images/kernel_read_write/test_write_image.cpp
index e848ab4ffb..2beaf40298 100644
--- a/test_conformance/images/kernel_read_write/test_write_image.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_image.cpp
@@ -22,7 +22,6 @@
 extern bool gTestImage2DFromBuffer;
 extern cl_mem_flags gMemFlagsToUse;
 extern int gtestTypesToRun;
-extern bool gDeviceLt20;
 
 extern int test_write_image_1D_set( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, ExplicitType inputType, MTdata d );
 extern int test_write_image_3D_set( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, ExplicitType inputType, MTdata d );
@@ -682,7 +681,8 @@ int test_write_image_set( cl_device_id device, cl_context context, cl_command_qu
              gTestMipmaps ? ", lod" : "" );
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0");
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
     // Run tests
diff --git a/test_conformance/images/samplerlessReads/main.cpp b/test_conformance/images/samplerlessReads/main.cpp
index cb4425924f..cd377793eb 100644
--- a/test_conformance/images/samplerlessReads/main.cpp
+++ b/test_conformance/images/samplerlessReads/main.cpp
@@ -36,7 +36,6 @@ int                 gTypesToTest;
 cl_channel_type     gChannelTypeToUse = (cl_channel_type)-1;
 cl_channel_order    gChannelOrderToUse = (cl_channel_order)-1;
 bool                gEnablePitch = false;
-bool                gDeviceLt20 = false;
 
 static void printUsage( const char *execName );
 
diff --git a/test_conformance/images/samplerlessReads/test_iterations.cpp b/test_conformance/images/samplerlessReads/test_iterations.cpp
index 11b364f929..f6bb42afd4 100644
--- a/test_conformance/images/samplerlessReads/test_iterations.cpp
+++ b/test_conformance/images/samplerlessReads/test_iterations.cpp
@@ -22,7 +22,6 @@
     #include <setjmp.h>
 #endif
 
-extern bool gDeviceLt20;
 extern bool gTestReadWrite;
 
 const char *read2DKernelSourcePattern =
@@ -254,7 +253,8 @@ int test_read_image_set_2D( cl_device_id device, cl_context context, cl_command_
     }
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0" );
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
     if ( gTestSmallImages )
diff --git a/test_conformance/images/samplerlessReads/test_loops.cpp b/test_conformance/images/samplerlessReads/test_loops.cpp
index 27840ca70c..b5a956c661 100644
--- a/test_conformance/images/samplerlessReads/test_loops.cpp
+++ b/test_conformance/images/samplerlessReads/test_loops.cpp
@@ -17,7 +17,6 @@
 #include "../common.h"
 
 extern int gTypesToTest;
-extern bool gDeviceLt20;
 extern bool gTestReadWrite;
 
 extern int test_read_image_set_1D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler, ExplicitType outputType );
@@ -99,10 +98,6 @@ int test_image_set( cl_device_id device, cl_context context, cl_command_queue qu
     cl_image_format *formatList;
     bool *filterFlags;
     unsigned int numFormats;
-    auto version = get_device_cl_version(device);
-    if (version < Version(2, 0)) {
-        gDeviceLt20 = true;
-    }
 
     if (gTestReadWrite && checkForReadWriteImageSupport(device))
     {
diff --git a/test_conformance/images/samplerlessReads/test_read_1D.cpp b/test_conformance/images/samplerlessReads/test_read_1D.cpp
index d17fdfcf94..a55d2be5d1 100644
--- a/test_conformance/images/samplerlessReads/test_read_1D.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_1D.cpp
@@ -22,7 +22,6 @@
     #include <setjmp.h>
 #endif
 
-extern bool gDeviceLt20;
 extern bool gTestReadWrite;
 
 const char *read1DKernelSourcePattern =
@@ -252,7 +251,8 @@ int test_read_image_set_1D( cl_device_id device, cl_context context, cl_command_
 
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0" );
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
     if ( gTestSmallImages )
diff --git a/test_conformance/images/samplerlessReads/test_read_1D_array.cpp b/test_conformance/images/samplerlessReads/test_read_1D_array.cpp
index 6a0e1d535e..b63fed4f0c 100644
--- a/test_conformance/images/samplerlessReads/test_read_1D_array.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_1D_array.cpp
@@ -22,7 +22,6 @@
     #include <setjmp.h>
 #endif
 
-extern bool gDeviceLt20;
 extern bool gTestReadWrite;
 
 const char *read1DArrayKernelSourcePattern =
@@ -251,7 +250,8 @@ int test_read_image_set_1D_array( cl_device_id device, cl_context context, cl_co
 
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0" );
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
     if ( gTestSmallImages )
diff --git a/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp b/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp
index c21b12c828..ee48ec8407 100644
--- a/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp
@@ -22,7 +22,6 @@
     #include <setjmp.h>
 #endif
 
-extern bool gDeviceLt20;
 
 const char *read1DBufferKernelSourcePattern =
 "__kernel void sample_kernel( read_only image1d_buffer_t inputA, read_only image1d_t inputB, sampler_t sampler, __global int *results )\n"
@@ -244,7 +243,8 @@ int test_read_image_set_1D_buffer( cl_device_id device, cl_context context, cl_c
              readFormat );
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0" );
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
     if ( gTestSmallImages )
diff --git a/test_conformance/images/samplerlessReads/test_read_2D_array.cpp b/test_conformance/images/samplerlessReads/test_read_2D_array.cpp
index cfc12725fe..95a973c401 100644
--- a/test_conformance/images/samplerlessReads/test_read_2D_array.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_2D_array.cpp
@@ -16,7 +16,6 @@
 #include "../testBase.h"
 #include <float.h>
 
-extern bool gDeviceLt20;
 extern bool gTestReadWrite;
 
 const char *read2DArrayKernelSourcePattern =
@@ -241,7 +240,8 @@ int test_read_image_set_2D_array( cl_device_id device, cl_context context, cl_co
 
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0" );
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
 
diff --git a/test_conformance/images/samplerlessReads/test_read_3D.cpp b/test_conformance/images/samplerlessReads/test_read_3D.cpp
index da5466f77e..39ca71aedc 100644
--- a/test_conformance/images/samplerlessReads/test_read_3D.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_3D.cpp
@@ -16,7 +16,6 @@
 #include "../testBase.h"
 #include <float.h>
 
-extern bool gDeviceLt20;
 extern bool gTestReadWrite;
 
 const char *read3DKernelSourcePattern =
@@ -244,7 +243,8 @@ int test_read_image_set_3D( cl_device_id device, cl_context context, cl_command_
 
 
     ptr = programSrc;
-    error = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &ptr, "sample_kernel", gDeviceLt20 ? "" : "-cl-std=CL2.0" );
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                        "sample_kernel");
     test_error( error, "Unable to create testing kernel" );
 
 
diff --git a/test_conformance/pipes/test_pipe_limits.cpp b/test_conformance/pipes/test_pipe_limits.cpp
index 85247f8289..169ab80c35 100644
--- a/test_conformance/pipes/test_pipe_limits.cpp
+++ b/test_conformance/pipes/test_pipe_limits.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -218,9 +218,8 @@ int test_pipe_max_args(cl_device_id deviceID, cl_context context, cl_command_que
     const char *sources[] = { kernel_source.c_str() };
 
     // Create producer kernel
-    err = create_single_kernel_helper_with_build_options(
-        context, &program, &kernel[0], 1, sources, kernelName[0],
-        "-cl-std=CL2.0");
+    err = create_single_kernel_helper(context, &program, &kernel[0], 1, sources,
+                                      kernelName[0]);
     test_error_ret(err, " Error creating program", -1);
 
     //Create consumer kernel
@@ -368,9 +367,8 @@ int test_pipe_max_packet_size(cl_device_id deviceID, cl_context context, cl_comm
     const char *sources[] = { kernel_source.c_str() };
 
     // Create producer kernel
-    err = create_single_kernel_helper_with_build_options(
-        context, &program, &kernel[0], 1, sources, kernelName[0],
-        "-cl-std=CL2.0");
+    err = create_single_kernel_helper(context, &program, &kernel[0], 1, sources,
+                                      kernelName[0]);
     test_error_ret(err, " Error creating program", -1);
 
     //Create consumer kernel
@@ -533,9 +531,8 @@ int test_pipe_max_active_reservations(cl_device_id deviceID, cl_context context,
     const char *sources[] = { kernel_source.c_str() };
 
     // Create producer kernel
-    err = create_single_kernel_helper_with_build_options(
-        context, &program, &kernel[0], 1, sources, kernelName[0],
-        "-cl-std=CL2.0");
+    err = create_single_kernel_helper(context, &program, &kernel[0], 1, sources,
+                                      kernelName[0]);
     test_error_ret(err, " Error creating program", -1);
 
     // Create consumer kernel
diff --git a/test_conformance/pipes/test_pipe_query_functions.cpp b/test_conformance/pipes/test_pipe_query_functions.cpp
index f9c93aa230..21d195053a 100644
--- a/test_conformance/pipes/test_pipe_query_functions.cpp
+++ b/test_conformance/pipes/test_pipe_query_functions.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -140,7 +140,9 @@ int test_pipe_query_functions(cl_device_id deviceID, cl_context context, cl_comm
     test_error_ret(err, " clCreatePipe failed", -1);
 
     // Create producer kernel
-    err = create_single_kernel_helper_with_build_options(context, &program, &kernel[0], 1, (const char**)&pipe_query_functions_kernel_code, kernelName[0], "-cl-std=CL2.0");
+    err = create_single_kernel_helper(
+        context, &program, &kernel[0], 1,
+        (const char **)&pipe_query_functions_kernel_code, kernelName[0]);
     test_error_ret(err, " Error creating program", -1);
 
     //Create consumer kernel
diff --git a/test_conformance/pipes/test_pipe_read_write.cpp b/test_conformance/pipes/test_pipe_read_write.cpp
index 64ee31b3f4..dd0d1216bb 100644
--- a/test_conformance/pipes/test_pipe_read_write.cpp
+++ b/test_conformance/pipes/test_pipe_read_write.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -531,9 +531,8 @@ int test_pipe_readwrite( cl_device_id deviceID, cl_context context, cl_command_q
         std::string kernel_source = sourceCode[i].str();
         const char *sources[] = { kernel_source.c_str() };
         // Create producer kernel
-        err = create_single_kernel_helper_with_build_options(
-            context, &program[i], &kernel[ii], 1, sources, kernelName[ii],
-            "-cl-std=CL2.0");
+        err = create_single_kernel_helper(context, &program[i], &kernel[ii], 1,
+                                          sources, kernelName[ii]);
 
         test_error_ret(err, " Error creating program", -1);
 
@@ -659,7 +658,8 @@ int test_pipe_readwrite_struct_generic( cl_device_id deviceID, cl_context contex
     test_error_ret(err, " clCreatePipe failed", -1);
 
     // Create producer kernel
-    err = create_single_kernel_helper_with_build_options(context, &program, &kernel[0], 1, &kernelCode, kernelName[0], "-cl-std=CL2.0");
+    err = create_single_kernel_helper(context, &program, &kernel[0], 1,
+                                      &kernelCode, kernelName[0]);
     test_error_ret(err, " Error creating program", -1);
 
     //Create consumer kernel
diff --git a/test_conformance/pipes/test_pipe_readwrite_errors.cpp b/test_conformance/pipes/test_pipe_readwrite_errors.cpp
index 1b9fc31388..d4b4524876 100644
--- a/test_conformance/pipes/test_pipe_readwrite_errors.cpp
+++ b/test_conformance/pipes/test_pipe_readwrite_errors.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -115,7 +115,9 @@ int test_pipe_readwrite_errors(cl_device_id deviceID, cl_context context, cl_com
     test_error_ret(err, " clCreatePipe failed", -1);
 
     // Create producer kernel
-    err = create_single_kernel_helper_with_build_options(context, &program, &kernel[0], 1, (const char**)&pipe_readwrite_errors_kernel_code, kernelName[0], "-cl-std=CL2.0");
+    err = create_single_kernel_helper(
+        context, &program, &kernel[0], 1,
+        (const char **)&pipe_readwrite_errors_kernel_code, kernelName[0]);
     test_error_ret(err, " Error creating program", -1);
 
     //Create consumer kernel
diff --git a/test_conformance/pipes/test_pipe_subgroups.cpp b/test_conformance/pipes/test_pipe_subgroups.cpp
index b41170ca08..b3e17183d2 100644
--- a/test_conformance/pipes/test_pipe_subgroups.cpp
+++ b/test_conformance/pipes/test_pipe_subgroups.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -146,7 +146,9 @@ int test_pipe_subgroups_divergence(cl_device_id deviceID, cl_context context, cl
     test_error_ret(err, " clCreatePipe failed", -1);
 
     // Create producer kernel
-    err = create_single_kernel_helper_with_build_options(context, &program, &kernel[0], 1, (const char**)&pipe_subgroups_kernel_code, kernelName[0], "-cl-std=CL2.0");
+    err = create_single_kernel_helper(
+        context, &program, &kernel[0], 1,
+        (const char **)&pipe_subgroups_kernel_code, kernelName[0]);
     test_error_ret(err, " Error creating program", -1);
 
     //Create consumer kernel
diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h
index 6e84ccb37f..dc49af2d25 100644
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -377,8 +377,8 @@ struct test
         const std::string &kernel_str = kernel_sstr.str();
         const char *kernel_src = kernel_str.c_str();
 
-        error = create_single_kernel_helper_with_build_options(
-            context, &program, &kernel, 1, &kernel_src, kname, "-cl-std=CL2.0");
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            &kernel_src, kname);
         if (error != 0) return error;
 
         // Determine some local dimensions to use for the test.
diff --git a/test_conformance/subgroups/test_queries.cpp b/test_conformance/subgroups/test_queries.cpp
index 2ad3d7fad7..761ca7a6a1 100644
--- a/test_conformance/subgroups/test_queries.cpp
+++ b/test_conformance/subgroups/test_queries.cpp
@@ -67,9 +67,8 @@ int test_sub_group_info(cl_device_id device, cl_context context,
 
     const std::string &kernel_str = kernel_sstr.str();
     const char *kernel_src = kernel_str.c_str();
-    error = create_single_kernel_helper_with_build_options(
-        context, &program, &kernel, 1, &kernel_src, "query_kernel",
-        "-cl-std=CL2.0");
+    error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                        &kernel_src, "query_kernel");
     if (error != 0) return error;
 
     // Determine some local dimensions to use for the test.
diff --git a/test_conformance/subgroups/test_workitem.cpp b/test_conformance/subgroups/test_workitem.cpp
index b77bfe1af6..7ffa6a7c39 100644
--- a/test_conformance/subgroups/test_workitem.cpp
+++ b/test_conformance/subgroups/test_workitem.cpp
@@ -227,9 +227,8 @@ int test_work_item_functions(cl_device_id device, cl_context context,
            "}";
     const std::string &kernel_str = kernel_sstr.str();
     const char *kernel_src = kernel_str.c_str();
-    error = create_single_kernel_helper_with_build_options(
-        context, &program, &kernel, 1, &kernel_src, "get_test",
-        "-cl-std=CL2.0");
+    error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                        &kernel_src, "get_test");
     if (error != 0) return error;
 
     error = get_max_allowed_work_group_size(context, kernel, &local, NULL);
diff --git a/test_conformance/workgroups/test_wg_all.cpp b/test_conformance/workgroups/test_wg_all.cpp
index 2148fba7bd..ccf17b6e4f 100644
--- a/test_conformance/workgroups/test_wg_all.cpp
+++ b/test_conformance/workgroups/test_wg_all.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -79,7 +79,8 @@ test_work_group_all(cl_device_id device, cl_context context, cl_command_queue qu
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_all_kernel_code, "test_wg_all", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_all_kernel_code, "test_wg_all");
     if (err)
         return -1;
 
diff --git a/test_conformance/workgroups/test_wg_any.cpp b/test_conformance/workgroups/test_wg_any.cpp
index 35ce0d524f..4785ad5136 100644
--- a/test_conformance/workgroups/test_wg_any.cpp
+++ b/test_conformance/workgroups/test_wg_any.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -79,7 +79,8 @@ test_work_group_any(cl_device_id device, cl_context context, cl_command_queue qu
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_any_kernel_code, "test_wg_any", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_any_kernel_code, "test_wg_any");
     if (err)
         return -1;
 
diff --git a/test_conformance/workgroups/test_wg_broadcast.cpp b/test_conformance/workgroups/test_wg_broadcast.cpp
index 3da14fb5a9..35559476ba 100644
--- a/test_conformance/workgroups/test_wg_broadcast.cpp
+++ b/test_conformance/workgroups/test_wg_broadcast.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -174,7 +174,9 @@ test_work_group_broadcast_1D(cl_device_id device, cl_context context, cl_command
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_broadcast_1D_kernel_code, "test_wg_broadcast_1D", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_broadcast_1D_kernel_code,
+                                      "test_wg_broadcast_1D");
     if (err)
         return -1;
 
@@ -281,7 +283,9 @@ test_work_group_broadcast_2D(cl_device_id device, cl_context context, cl_command
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_broadcast_2D_kernel_code, "test_wg_broadcast_2D", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_broadcast_2D_kernel_code,
+                                      "test_wg_broadcast_2D");
     if (err)
         return -1;
 
@@ -406,7 +410,9 @@ test_work_group_broadcast_3D(cl_device_id device, cl_context context, cl_command
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_broadcast_3D_kernel_code, "test_wg_broadcast_3D", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_broadcast_3D_kernel_code,
+                                      "test_wg_broadcast_3D");
     if (err)
         return -1;
 
diff --git a/test_conformance/workgroups/test_wg_reduce.cpp b/test_conformance/workgroups/test_wg_reduce.cpp
index 5da7284acb..eb26f4985d 100644
--- a/test_conformance/workgroups/test_wg_reduce.cpp
+++ b/test_conformance/workgroups/test_wg_reduce.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -176,7 +176,9 @@ test_work_group_reduce_add_int(cl_device_id device, cl_context context, cl_comma
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_reduce_add_kernel_code_int, "test_wg_reduce_add_int", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_reduce_add_kernel_code_int,
+                                      "test_wg_reduce_add_int");
     if (err)
         return -1;
 
@@ -279,7 +281,9 @@ test_work_group_reduce_add_uint(cl_device_id device, cl_context context, cl_comm
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_reduce_add_kernel_code_uint, "test_wg_reduce_add_uint", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_reduce_add_kernel_code_uint,
+                                      "test_wg_reduce_add_uint");
     if (err)
         return -1;
 
@@ -381,7 +385,9 @@ test_work_group_reduce_add_long(cl_device_id device, cl_context context, cl_comm
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_reduce_add_kernel_code_long, "test_wg_reduce_add_long", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_reduce_add_kernel_code_long,
+                                      "test_wg_reduce_add_long");
     if (err)
         return -1;
 
@@ -484,7 +490,9 @@ test_work_group_reduce_add_ulong(cl_device_id device, cl_context context, cl_com
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_reduce_add_kernel_code_ulong, "test_wg_reduce_add_ulong", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_reduce_add_kernel_code_ulong,
+                                      "test_wg_reduce_add_ulong");
     if (err)
         return -1;
 
diff --git a/test_conformance/workgroups/test_wg_reduce_max.cpp b/test_conformance/workgroups/test_wg_reduce_max.cpp
index 2464beda00..3bbd3f25bf 100644
--- a/test_conformance/workgroups/test_wg_reduce_max.cpp
+++ b/test_conformance/workgroups/test_wg_reduce_max.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -177,7 +177,9 @@ test_work_group_reduce_max_int(cl_device_id device, cl_context context, cl_comma
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_reduce_max_kernel_code_int, "test_wg_reduce_max_int", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_reduce_max_kernel_code_int,
+                                      "test_wg_reduce_max_int");
     if (err)
         return -1;
 
@@ -289,7 +291,9 @@ test_work_group_reduce_max_uint(cl_device_id device, cl_context context, cl_comm
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_reduce_max_kernel_code_uint, "test_wg_reduce_max_uint", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_reduce_max_kernel_code_uint,
+                                      "test_wg_reduce_max_uint");
     if (err)
         return -1;
 
@@ -400,7 +404,9 @@ test_work_group_reduce_max_long(cl_device_id device, cl_context context, cl_comm
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_reduce_max_kernel_code_long, "test_wg_reduce_max_long", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_reduce_max_kernel_code_long,
+                                      "test_wg_reduce_max_long");
     if (err)
         return -1;
 
@@ -512,7 +518,9 @@ test_work_group_reduce_max_ulong(cl_device_id device, cl_context context, cl_com
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_reduce_max_kernel_code_ulong, "test_wg_reduce_max_ulong", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_reduce_max_kernel_code_ulong,
+                                      "test_wg_reduce_max_ulong");
     if (err)
         return -1;
 
diff --git a/test_conformance/workgroups/test_wg_reduce_min.cpp b/test_conformance/workgroups/test_wg_reduce_min.cpp
index f415aa74e2..7b1b22e88c 100644
--- a/test_conformance/workgroups/test_wg_reduce_min.cpp
+++ b/test_conformance/workgroups/test_wg_reduce_min.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -177,7 +177,9 @@ test_work_group_reduce_min_int(cl_device_id device, cl_context context, cl_comma
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_reduce_min_kernel_code_int, "test_wg_reduce_min_int", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_reduce_min_kernel_code_int,
+                                      "test_wg_reduce_min_int");
     if (err)
         return -1;
 
@@ -289,7 +291,9 @@ test_work_group_reduce_min_uint(cl_device_id device, cl_context context, cl_comm
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_reduce_min_kernel_code_uint, "test_wg_reduce_min_uint", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_reduce_min_kernel_code_uint,
+                                      "test_wg_reduce_min_uint");
     if (err)
         return -1;
 
@@ -400,7 +404,9 @@ test_work_group_reduce_min_long(cl_device_id device, cl_context context, cl_comm
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_reduce_min_kernel_code_long, "test_wg_reduce_min_long", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_reduce_min_kernel_code_long,
+                                      "test_wg_reduce_min_long");
     if (err)
         return -1;
 
@@ -512,7 +518,9 @@ test_work_group_reduce_min_ulong(cl_device_id device, cl_context context, cl_com
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_reduce_min_kernel_code_ulong, "test_wg_reduce_min_ulong", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_reduce_min_kernel_code_ulong,
+                                      "test_wg_reduce_min_ulong");
     if (err)
         return -1;
 
diff --git a/test_conformance/workgroups/test_wg_scan_exclusive_add.cpp b/test_conformance/workgroups/test_wg_scan_exclusive_add.cpp
index 07eedc1616..e695a16545 100644
--- a/test_conformance/workgroups/test_wg_scan_exclusive_add.cpp
+++ b/test_conformance/workgroups/test_wg_scan_exclusive_add.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -184,7 +184,9 @@ test_work_group_scan_exclusive_add_int(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_exclusive_add_kernel_code_int, "test_wg_scan_exclusive_add_int", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_exclusive_add_kernel_code_int,
+                                      "test_wg_scan_exclusive_add_int");
     if (err)
         return -1;
 
@@ -287,7 +289,9 @@ test_work_group_scan_exclusive_add_uint(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_exclusive_add_kernel_code_uint, "test_wg_scan_exclusive_add_uint", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_exclusive_add_kernel_code_uint,
+                                      "test_wg_scan_exclusive_add_uint");
     if (err)
         return -1;
 
@@ -389,7 +393,9 @@ test_work_group_scan_exclusive_add_long(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_exclusive_add_kernel_code_long, "test_wg_scan_exclusive_add_long", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_exclusive_add_kernel_code_long,
+                                      "test_wg_scan_exclusive_add_long");
     if (err)
         return -1;
 
@@ -492,7 +498,9 @@ test_work_group_scan_exclusive_add_ulong(cl_device_id device, cl_context context
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_exclusive_add_kernel_code_ulong, "test_wg_scan_exclusive_add_ulong", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_exclusive_add_kernel_code_ulong,
+                                      "test_wg_scan_exclusive_add_ulong");
     if (err)
         return -1;
 
diff --git a/test_conformance/workgroups/test_wg_scan_exclusive_max.cpp b/test_conformance/workgroups/test_wg_scan_exclusive_max.cpp
index d20a31994d..12338b685e 100644
--- a/test_conformance/workgroups/test_wg_scan_exclusive_max.cpp
+++ b/test_conformance/workgroups/test_wg_scan_exclusive_max.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -176,7 +176,9 @@ test_work_group_scan_exclusive_max_int(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_exclusive_max_kernel_code_int, "test_wg_scan_exclusive_max_int", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_exclusive_max_kernel_code_int,
+                                      "test_wg_scan_exclusive_max_int");
     if (err)
         return -1;
 
@@ -288,7 +290,9 @@ test_work_group_scan_exclusive_max_uint(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_exclusive_max_kernel_code_uint, "test_wg_scan_exclusive_max_uint", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_exclusive_max_kernel_code_uint,
+                                      "test_wg_scan_exclusive_max_uint");
     if (err)
         return -1;
 
@@ -399,7 +403,9 @@ test_work_group_scan_exclusive_max_long(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_exclusive_max_kernel_code_long, "test_wg_scan_exclusive_max_long", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_exclusive_max_kernel_code_long,
+                                      "test_wg_scan_exclusive_max_long");
     if (err)
         return -1;
 
@@ -511,7 +517,9 @@ test_work_group_scan_exclusive_max_ulong(cl_device_id device, cl_context context
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_exclusive_max_kernel_code_ulong, "test_wg_scan_exclusive_max_ulong", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_exclusive_max_kernel_code_ulong,
+                                      "test_wg_scan_exclusive_max_ulong");
     if (err)
         return -1;
 
diff --git a/test_conformance/workgroups/test_wg_scan_exclusive_min.cpp b/test_conformance/workgroups/test_wg_scan_exclusive_min.cpp
index eb99796059..f4e6bf9772 100644
--- a/test_conformance/workgroups/test_wg_scan_exclusive_min.cpp
+++ b/test_conformance/workgroups/test_wg_scan_exclusive_min.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -177,7 +177,9 @@ test_work_group_scan_exclusive_min_int(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_exclusive_min_kernel_code_int, "test_wg_scan_exclusive_min_int", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_exclusive_min_kernel_code_int,
+                                      "test_wg_scan_exclusive_min_int");
     if (err)
         return -1;
 
@@ -289,7 +291,9 @@ test_work_group_scan_exclusive_min_uint(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_exclusive_min_kernel_code_uint, "test_wg_scan_exclusive_min_uint", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_exclusive_min_kernel_code_uint,
+                                      "test_wg_scan_exclusive_min_uint");
     if (err)
         return -1;
 
@@ -400,7 +404,9 @@ test_work_group_scan_exclusive_min_long(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_exclusive_min_kernel_code_long, "test_wg_scan_exclusive_min_long", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_exclusive_min_kernel_code_long,
+                                      "test_wg_scan_exclusive_min_long");
     if (err)
         return -1;
 
@@ -512,7 +518,9 @@ test_work_group_scan_exclusive_min_ulong(cl_device_id device, cl_context context
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_exclusive_min_kernel_code_ulong, "test_wg_scan_exclusive_min_ulong", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_exclusive_min_kernel_code_ulong,
+                                      "test_wg_scan_exclusive_min_ulong");
     if (err)
         return -1;
 
diff --git a/test_conformance/workgroups/test_wg_scan_inclusive_add.cpp b/test_conformance/workgroups/test_wg_scan_inclusive_add.cpp
index bff0b0f770..51c98a4e7b 100644
--- a/test_conformance/workgroups/test_wg_scan_inclusive_add.cpp
+++ b/test_conformance/workgroups/test_wg_scan_inclusive_add.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -173,7 +173,9 @@ test_work_group_scan_inclusive_add_int(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_inclusive_add_kernel_code_int, "test_wg_scan_inclusive_add_int", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_inclusive_add_kernel_code_int,
+                                      "test_wg_scan_inclusive_add_int");
     if (err)
         return -1;
 
@@ -276,7 +278,9 @@ test_work_group_scan_inclusive_add_uint(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_inclusive_add_kernel_code_uint, "test_wg_scan_inclusive_add_uint", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_inclusive_add_kernel_code_uint,
+                                      "test_wg_scan_inclusive_add_uint");
     if (err)
         return -1;
 
@@ -378,7 +382,9 @@ test_work_group_scan_inclusive_add_long(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_inclusive_add_kernel_code_long, "test_wg_scan_inclusive_add_long", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_inclusive_add_kernel_code_long,
+                                      "test_wg_scan_inclusive_add_long");
     if (err)
         return -1;
 
@@ -481,7 +487,9 @@ test_work_group_scan_inclusive_add_ulong(cl_device_id device, cl_context context
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_inclusive_add_kernel_code_ulong, "test_wg_scan_inclusive_add_ulong", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_inclusive_add_kernel_code_ulong,
+                                      "test_wg_scan_inclusive_add_ulong");
     if (err)
         return -1;
 
diff --git a/test_conformance/workgroups/test_wg_scan_inclusive_max.cpp b/test_conformance/workgroups/test_wg_scan_inclusive_max.cpp
index c2455e9cf6..44ebf8059e 100644
--- a/test_conformance/workgroups/test_wg_scan_inclusive_max.cpp
+++ b/test_conformance/workgroups/test_wg_scan_inclusive_max.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -175,7 +175,9 @@ test_work_group_scan_inclusive_max_int(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_inclusive_max_kernel_code_int, "test_wg_scan_inclusive_max_int", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_inclusive_max_kernel_code_int,
+                                      "test_wg_scan_inclusive_max_int");
     if (err)
         return -1;
 
@@ -278,7 +280,9 @@ test_work_group_scan_inclusive_max_uint(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_inclusive_max_kernel_code_uint, "test_wg_scan_inclusive_max_uint", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_inclusive_max_kernel_code_uint,
+                                      "test_wg_scan_inclusive_max_uint");
     if (err)
         return -1;
 
@@ -380,7 +384,9 @@ test_work_group_scan_inclusive_max_long(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_inclusive_max_kernel_code_long, "test_wg_scan_inclusive_max_long", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_inclusive_max_kernel_code_long,
+                                      "test_wg_scan_inclusive_max_long");
     if (err)
         return -1;
 
@@ -483,7 +489,9 @@ test_work_group_scan_inclusive_max_ulong(cl_device_id device, cl_context context
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_inclusive_max_kernel_code_ulong, "test_wg_scan_inclusive_max_ulong", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_inclusive_max_kernel_code_ulong,
+                                      "test_wg_scan_inclusive_max_ulong");
     if (err)
         return -1;
 
diff --git a/test_conformance/workgroups/test_wg_scan_inclusive_min.cpp b/test_conformance/workgroups/test_wg_scan_inclusive_min.cpp
index a73c35a6df..f2f05788f5 100644
--- a/test_conformance/workgroups/test_wg_scan_inclusive_min.cpp
+++ b/test_conformance/workgroups/test_wg_scan_inclusive_min.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -175,7 +175,9 @@ test_work_group_scan_inclusive_min_int(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_inclusive_min_kernel_code_int, "test_wg_scan_inclusive_min_int", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_inclusive_min_kernel_code_int,
+                                      "test_wg_scan_inclusive_min_int");
     if (err)
         return -1;
 
@@ -278,7 +280,9 @@ test_work_group_scan_inclusive_min_uint(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_inclusive_min_kernel_code_uint, "test_wg_scan_inclusive_min_uint", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_inclusive_min_kernel_code_uint,
+                                      "test_wg_scan_inclusive_min_uint");
     if (err)
         return -1;
 
@@ -380,7 +384,9 @@ test_work_group_scan_inclusive_min_long(cl_device_id device, cl_context context,
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_inclusive_min_kernel_code_long, "test_wg_scan_inclusive_min_long", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_inclusive_min_kernel_code_long,
+                                      "test_wg_scan_inclusive_min_long");
     if (err)
         return -1;
 
@@ -483,7 +489,9 @@ test_work_group_scan_inclusive_min_ulong(cl_device_id device, cl_context context
     int          i;
     MTdata       d;
 
-    err = create_single_kernel_helper_with_build_options( context, &program, &kernel, 1, &wg_scan_inclusive_min_kernel_code_ulong, "test_wg_scan_inclusive_min_ulong", "-cl-std=CL2.0" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      &wg_scan_inclusive_min_kernel_code_ulong,
+                                      "test_wg_scan_inclusive_min_ulong");
     if (err)
         return -1;
 

From 901f5fcb63532877947d764b9938b34a56914435 Mon Sep 17 00:00:00 2001
From: james-morrissey-arm <james-morrissey-arm@users.noreply.github.com>
Date: Mon, 11 Jan 2021 15:02:26 +0000
Subject: [PATCH 014/158] Fix samplerlessReads read_write (#329) (#1016)

The test was trying to create a read_write image with read_only formats.
Make it use only the image formats common to both the read_only and
read_write flags when creating images.

Fixes issue #329

Signed-off-by: Radek Szymanski <radek.szymanski@arm.com>
Signed-off-by: James Morrissey <james.morrissey@arm.com>

Co-authored-by: Radek Szymanski <radek.szymanski@arm.com>
---
 .../images/samplerlessReads/test_loops.cpp    | 55 +++++++++++++++++--
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/test_conformance/images/samplerlessReads/test_loops.cpp b/test_conformance/images/samplerlessReads/test_loops.cpp
index b5a956c661..6f5d009b80 100644
--- a/test_conformance/images/samplerlessReads/test_loops.cpp
+++ b/test_conformance/images/samplerlessReads/test_loops.cpp
@@ -96,7 +96,6 @@ int test_image_set( cl_device_id device, cl_context context, cl_command_queue qu
 
     // Grab the list of supported image formats
     cl_image_format *formatList;
-    bool *filterFlags;
     unsigned int numFormats;
 
     if (gTestReadWrite && checkForReadWriteImageSupport(device))
@@ -104,14 +103,58 @@ int test_image_set( cl_device_id device, cl_context context, cl_command_queue qu
         return TEST_SKIPPED_ITSELF;
     }
 
-    // This flag is only for querying the list of supported formats
-    // The flag for creating image will be set explicitly in test functions
-    cl_mem_flags flags = (gTestReadWrite)? CL_MEM_KERNEL_READ_AND_WRITE : CL_MEM_READ_ONLY;
+    cl_image_format *readOnlyFormats;
+    unsigned int numReadOnlyFormats;
 
-    if ( get_format_list( context, imageType, formatList, numFormats, flags ) )
+    if (get_format_list(context, imageType, readOnlyFormats, numReadOnlyFormats,
+                        CL_MEM_READ_ONLY))
         return -1;
 
-    filterFlags = new bool[ numFormats ];
+    if (gTestReadWrite)
+    {
+        cl_image_format *readWriteFormats;
+        unsigned int numReadWriteFormats;
+
+        if (get_format_list(context, imageType, readWriteFormats,
+                            numReadWriteFormats, CL_MEM_KERNEL_READ_AND_WRITE))
+            return -1;
+
+        numFormats = numReadOnlyFormats;
+        formatList = new cl_image_format[numFormats];
+        unsigned int k = 0;
+
+        // Keep only intersecting formats with read only and read write flags
+        for (unsigned int i = 0; i < numReadOnlyFormats; i++)
+        {
+            for (unsigned int j = 0; j < numReadWriteFormats; j++)
+            {
+                if (readOnlyFormats[i].image_channel_data_type
+                        == readWriteFormats[j].image_channel_data_type
+                    && readOnlyFormats[i].image_channel_order
+                        == readWriteFormats[j].image_channel_order)
+                {
+                    formatList[k].image_channel_data_type =
+                        readOnlyFormats[i].image_channel_data_type;
+                    formatList[k].image_channel_order =
+                        readOnlyFormats[i].image_channel_order;
+                    k++;
+                    break;
+                }
+            }
+        }
+
+        numFormats = k;
+
+        delete[] readOnlyFormats;
+        delete[] readWriteFormats;
+    }
+    else
+    {
+        numFormats = numReadOnlyFormats;
+        formatList = readOnlyFormats;
+    }
+
+    bool *filterFlags = new bool[numFormats];
     if ( filterFlags == NULL )
     {
         log_error( "ERROR: Out of memory allocating filter flags list!\n" );

From ffa75c37ce2c65217e2cea2fe473e91c0ef3dc57 Mon Sep 17 00:00:00 2001
From: Anastasia Stulova <38433336+AnastasiaStulova@users.noreply.github.com>
Date: Tue, 12 Jan 2021 10:17:13 +0000
Subject: [PATCH 015/158] Test cl_ext_cxx_for_opencl (#1095)

Add tests for the API extension for compilation
of kernels in the C++ for OpenCL language.
* Test that -cl-std=CLC++ is accepted and a
  basic kernel with C++ features is compiled.
* Test that API extension reports the same
  language version as __OPENCL_CPP_VERSION__.

This commit also adds a separate folder for extension
tests.

Signed-off-by: Victoria Holodovsky <victoria.holodovsky@arm.com>

Co-authored-by: Victoria Holodovsky <victoria.holodovsky@arm.com>
---
 test_conformance/CMakeLists.txt               |   1 +
 test_conformance/extensions/CMakeLists.txt    |   1 +
 .../cl_ext_cxx_for_opencl/CMakeLists.txt      |   9 ++
 .../cxx_for_opencl_ext.cpp                    | 105 ++++++++++++++++++
 .../cxx_for_opencl_ver.cpp                    | 102 +++++++++++++++++
 .../extensions/cl_ext_cxx_for_opencl/main.cpp |  28 +++++
 .../extensions/cl_ext_cxx_for_opencl/procs.h  |  26 +++++
 7 files changed, 272 insertions(+)
 create mode 100644 test_conformance/extensions/CMakeLists.txt
 create mode 100644 test_conformance/extensions/cl_ext_cxx_for_opencl/CMakeLists.txt
 create mode 100644 test_conformance/extensions/cl_ext_cxx_for_opencl/cxx_for_opencl_ext.cpp
 create mode 100644 test_conformance/extensions/cl_ext_cxx_for_opencl/cxx_for_opencl_ver.cpp
 create mode 100644 test_conformance/extensions/cl_ext_cxx_for_opencl/main.cpp
 create mode 100644 test_conformance/extensions/cl_ext_cxx_for_opencl/procs.h

diff --git a/test_conformance/CMakeLists.txt b/test_conformance/CMakeLists.txt
index 6714e234be..b9b87c1d35 100644
--- a/test_conformance/CMakeLists.txt
+++ b/test_conformance/CMakeLists.txt
@@ -21,6 +21,7 @@ if(D3D11_IS_SUPPORTED)
 endif(D3D11_IS_SUPPORTED)
 add_subdirectory( device_partition )
 add_subdirectory( events )
+add_subdirectory( extensions )
 add_subdirectory( geometrics )
 if(GL_IS_SUPPORTED)
    add_subdirectory( gl )
diff --git a/test_conformance/extensions/CMakeLists.txt b/test_conformance/extensions/CMakeLists.txt
new file mode 100644
index 0000000000..c917eb3060
--- /dev/null
+++ b/test_conformance/extensions/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory( cl_ext_cxx_for_opencl )
diff --git a/test_conformance/extensions/cl_ext_cxx_for_opencl/CMakeLists.txt b/test_conformance/extensions/cl_ext_cxx_for_opencl/CMakeLists.txt
new file mode 100644
index 0000000000..fd397c31b6
--- /dev/null
+++ b/test_conformance/extensions/cl_ext_cxx_for_opencl/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(MODULE_NAME CL_EXT_CXX_FOR_OPENCL)
+
+set(${MODULE_NAME}_SOURCES
+    main.cpp
+    cxx_for_opencl_ext.cpp
+    cxx_for_opencl_ver.cpp
+)
+
+include(../../CMakeCommon.txt)
diff --git a/test_conformance/extensions/cl_ext_cxx_for_opencl/cxx_for_opencl_ext.cpp b/test_conformance/extensions/cl_ext_cxx_for_opencl/cxx_for_opencl_ext.cpp
new file mode 100644
index 0000000000..4b03b54ae1
--- /dev/null
+++ b/test_conformance/extensions/cl_ext_cxx_for_opencl/cxx_for_opencl_ext.cpp
@@ -0,0 +1,105 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+
+// Build a C++ for OpenCL program and check its two kernels turn 7 into 28.
+int test_cxx_for_opencl(cl_device_id device, cl_context context,
+                        cl_command_queue queue)
+{
+    cl_int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel1;
+    clKernelWrapper kernel2;
+    clMemWrapper in_buffer;
+    clMemWrapper out_buffer;
+    cl_int value = 7;
+    // k1 stores 2 * (*p) into the global x; k2 stores 2 * x back through p.
+    const char *kernel_sstr =
+        R"(
+        __global int x;
+        template<typename T>
+        void execute(T &a, const T &b) {
+            a = b * 2;
+        }
+        __kernel void k1(__global int *p) {
+            execute(x, *p);
+        }
+        __kernel void k2(__global int *p) {
+            execute(*p, x);
+        })";
+
+    error = create_single_kernel_helper_with_build_options(
+        context, &program, &kernel1, 1, &kernel_sstr, "k1", "-cl-std=CLC++",
+        false);
+    test_error(error, "Failed to create k1 kernel");
+    // k2 is created from the same program object that k1 was built in.
+    kernel2 = clCreateKernel(program, "k2", &error);
+    test_error(error, "Failed to create k2 kernel");
+
+    in_buffer =
+        clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                       sizeof(value), &value, &error);
+    test_error(error, "clCreateBuffer failed");
+
+    out_buffer =
+        clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                       sizeof(value), &value, &error);
+    test_error(error, "clCreateBuffer failed");
+
+    error = clSetKernelArg(kernel1, 0, sizeof(in_buffer), &in_buffer);
+    test_error(error, "clSetKernelArg failed");
+
+    error = clSetKernelArg(kernel2, 0, sizeof(out_buffer), &out_buffer);
+    test_error(error, "clSetKernelArg failed");
+
+    size_t global_size = 1;
+    error = clEnqueueNDRangeKernel(queue, kernel1, 1, nullptr, &global_size,
+                                   nullptr, 0, nullptr, nullptr);
+    test_error(error, "clEnqueueNDRangeKernel failed");
+
+    error = clEnqueueNDRangeKernel(queue, kernel2, 1, nullptr, &global_size,
+                                   nullptr, 0, nullptr, nullptr);
+    test_error(error, "clEnqueueNDRangeKernel failed");
+    // Reading out_buffer back overwrites value with what k2 stored.
+    error = clEnqueueReadBuffer(queue, out_buffer, CL_BLOCKING, 0,
+                                sizeof(value), &value, 0, nullptr, nullptr);
+    test_error(error, "clEnqueueReadBuffer failed");
+
+    error = clFinish(queue);
+    test_error(error, "clFinish failed");
+    // 7 doubled by k1 and doubled again by k2 should read back as 28.
+    if (value != 28)
+    {
+        log_error("ERROR: Kernel wrote %lu, expected 28\n",
+                  static_cast<long unsigned>(value));
+        return TEST_FAIL;
+    }
+
+    return TEST_PASS;
+}
+
+int test_cxx_for_opencl_ext(cl_device_id device, cl_context context,
+                            cl_command_queue queue, int)
+{
+    // Run the real test only when the device advertises the extension.
+    if (is_extension_available(device, "cl_ext_cxx_for_opencl"))
+    {
+        return test_cxx_for_opencl(device, context, queue);
+    }
+    log_info("Device does not support 'cl_ext_cxx_for_opencl'. Skipping "
+             "the test.\n");
+    return TEST_SKIPPED_ITSELF;
+}
diff --git a/test_conformance/extensions/cl_ext_cxx_for_opencl/cxx_for_opencl_ver.cpp b/test_conformance/extensions/cl_ext_cxx_for_opencl/cxx_for_opencl_ver.cpp
new file mode 100644
index 0000000000..0376081462
--- /dev/null
+++ b/test_conformance/extensions/cl_ext_cxx_for_opencl/cxx_for_opencl_ver.cpp
@@ -0,0 +1,102 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+
+// Verify __OPENCL_CPP_VERSION__ matches the device-reported numeric version.
+int test_cxx_for_opencl_version(cl_device_id device, cl_context context,
+                                cl_command_queue queue)
+{
+    // Initialised because CL_MEM_COPY_HOST_PTR copies it into the buffer.
+    cl_int cxx4opencl_version = 0;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    cl_int error;
+    cl_int value = 0;
+    const char *kernel_sstr =
+        R"(
+        __kernel void k(__global int* buf) {
+            buf[0] = __OPENCL_CPP_VERSION__;
+        })";
+    const size_t lengths[1] = { std::string{ kernel_sstr }.size() };
+
+    clProgramWrapper writer_program =
+        clCreateProgramWithSource(context, 1, &kernel_sstr, lengths, &error);
+    test_error(error, "Failed to create program with source");
+
+    error = clCompileProgram(writer_program, 1, &device, "-cl-std=CLC++", 0,
+                             nullptr, nullptr, nullptr, nullptr);
+    test_error(error, "Failed to compile program");
+
+    cl_program progs[1] = { writer_program };
+    program = clLinkProgram(context, 1, &device, "", 1, progs, 0, 0, &error);
+    test_error(error, "Failed to link program");
+
+    clMemWrapper out =
+        clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                       sizeof(cxx4opencl_version), &cxx4opencl_version, &error);
+    test_error(error, "clCreateBuffer failed");
+
+    kernel = clCreateKernel(program, "k", &error);
+    test_error(error, "Failed to create k kernel");
+
+    error = clSetKernelArg(kernel, 0, sizeof(out), &out);
+    test_error(error, "clSetKernelArg failed");
+
+    size_t global_size = 1;
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, &global_size,
+                                   nullptr, 0, nullptr, nullptr);
+    test_error(error, "clEnqueueNDRangeKernel failed");
+
+    error = clEnqueueReadBuffer(queue, out, CL_BLOCKING, 0,
+                                sizeof(cxx4opencl_version), &cxx4opencl_version,
+                                0, nullptr, nullptr);
+    test_error(error, "clEnqueueReadBuffer failed");
+
+    error = clFinish(queue);
+    test_error(error, "clFinish failed");
+
+    error =
+        clGetDeviceInfo(device, CL_DEVICE_CXX_FOR_OPENCL_NUMERIC_VERSION_EXT,
+                        sizeof(value), &value, nullptr);
+    test_error(error, "Failed to get device info");
+    // Fold the packed major/minor/patch fields into one decimal number.
+    cl_int cxx4opencl_expected_version = CL_VERSION_MAJOR_KHR(value) * 100
+        + CL_VERSION_MINOR_KHR(value) * 10 + CL_VERSION_PATCH_KHR(value);
+
+    if (cxx4opencl_version != cxx4opencl_expected_version)
+    {
+        log_error("ERROR: C++ for OpenCL version mismatch - returned %lu, "
+                  "expected %lu\n",
+                  static_cast<long unsigned>(cxx4opencl_version),
+                  static_cast<long unsigned>(cxx4opencl_expected_version));
+        return TEST_FAIL;
+    }
+
+    return TEST_PASS;
+}
+
+int test_cxx_for_opencl_ver(cl_device_id device, cl_context context,
+                            cl_command_queue queue, int)
+{
+    // Run the version check only when the device advertises the extension.
+    if (is_extension_available(device, "cl_ext_cxx_for_opencl"))
+    {
+        return test_cxx_for_opencl_version(device, context, queue);
+    }
+    log_info("Device does not support 'cl_ext_cxx_for_opencl'. Skipping "
+             "the test.\n");
+    return TEST_SKIPPED_ITSELF;
+}
diff --git a/test_conformance/extensions/cl_ext_cxx_for_opencl/main.cpp b/test_conformance/extensions/cl_ext_cxx_for_opencl/main.cpp
new file mode 100644
index 0000000000..5e8c14af76
--- /dev/null
+++ b/test_conformance/extensions/cl_ext_cxx_for_opencl/main.cpp
@@ -0,0 +1,28 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "procs.h"
+
+test_definition test_list[] = {
+    ADD_TEST_VERSION(cxx_for_opencl_ext, Version(2, 0)),
+    ADD_TEST_VERSION(cxx_for_opencl_ver, Version(2, 0))
+};
+// Entry point: hands test_list to the harness and returns its result.
+int main(int argc, const char *argv[])
+{
+    return runTestHarnessWithCheck(argc, argv, ARRAY_SIZE(test_list), test_list,
+                                   false, 0, nullptr);
+}
diff --git a/test_conformance/extensions/cl_ext_cxx_for_opencl/procs.h b/test_conformance/extensions/cl_ext_cxx_for_opencl/procs.h
new file mode 100644
index 0000000000..5665e012ff
--- /dev/null
+++ b/test_conformance/extensions/cl_ext_cxx_for_opencl/procs.h
@@ -0,0 +1,26 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef _procs_h
+#define _procs_h
+
+#include "harness/typeWrappers.h"
+
+extern int test_cxx_for_opencl_ext(cl_device_id device, cl_context context,
+                                   cl_command_queue queue, int);
+extern int test_cxx_for_opencl_ver(cl_device_id device, cl_context context,
+                                   cl_command_queue queue, int);
+
+#endif /*_procs_h*/

From e5f89249fa2ac24dd8cc57b5d1f022025c9d2819 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Thu, 14 Jan 2021 13:27:18 +0000
Subject: [PATCH 016/158] Apply clang-format on math_brute_force (#1104)

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/FunctionList.cpp         |   79 +-
 .../math_brute_force/FunctionList.h           |  103 +-
 test_conformance/math_brute_force/Sleep.cpp   |  141 +-
 test_conformance/math_brute_force/Sleep.h     |    8 +-
 test_conformance/math_brute_force/Utility.cpp |   98 +-
 test_conformance/math_brute_force/Utility.h   |  217 +-
 test_conformance/math_brute_force/binary.cpp  | 1905 +++---
 .../math_brute_force/binaryOperator.cpp       | 1815 ++++--
 .../math_brute_force/binary_i.cpp             | 1559 +++--
 .../math_brute_force/binary_two_results_i.cpp | 1409 ++--
 test_conformance/math_brute_force/i_unary.cpp |  659 +-
 .../math_brute_force/macro_binary.cpp         | 1420 +++--
 .../math_brute_force/macro_unary.cpp          | 1063 ++--
 test_conformance/math_brute_force/mad.cpp     | 1626 ++---
 test_conformance/math_brute_force/main.cpp    | 1762 ++---
 .../math_brute_force/reference_math.cpp       | 5657 +++++++++--------
 .../math_brute_force/reference_math.h         |  398 +-
 test_conformance/math_brute_force/ternary.cpp | 1733 +++--
 test_conformance/math_brute_force/unary.cpp   | 1258 ++--
 .../math_brute_force/unary_two_results.cpp    | 1117 ++--
 .../math_brute_force/unary_two_results_i.cpp  |  861 ++-
 test_conformance/math_brute_force/unary_u.cpp |  741 ++-
 22 files changed, 14802 insertions(+), 10827 deletions(-)

diff --git a/test_conformance/math_brute_force/FunctionList.cpp b/test_conformance/math_brute_force/FunctionList.cpp
index a07fa06974..c5185c6fa6 100644
--- a/test_conformance/math_brute_force/FunctionList.cpp
+++ b/test_conformance/math_brute_force/FunctionList.cpp
@@ -16,13 +16,13 @@
 #include "FunctionList.h"
 #include "reference_math.h"
 
-#define FTZ_ON  1
+#define FTZ_ON 1
 #define FTZ_OFF 0
-#define EXACT    0.0f
+#define EXACT 0.0f
 #define RELAXED_ON 1
 #define RELAXED_OFF 0
 
-#define STRINGIFY( _s)                  #_s
+#define STRINGIFY(_s) #_s
 
 // Only use ulps information in spir test
 #ifdef FUNCTION_LIST_ULPS_ONLY
@@ -51,25 +51,25 @@
         STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \
             _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
     }
-#define unaryF                NULL
-#define i_unaryF              NULL
-#define unaryF_u              NULL
-#define macro_unaryF          NULL
-#define binaryF               NULL
-#define binaryF_nextafter     NULL
-#define binaryOperatorF       NULL
-#define binaryF_i             NULL
-#define macro_binaryF         NULL
-#define ternaryF              NULL
-#define unaryF_two_results    NULL
-#define unaryF_two_results_i  NULL
+#define unaryF NULL
+#define i_unaryF NULL
+#define unaryF_u NULL
+#define macro_unaryF NULL
+#define binaryF NULL
+#define binaryF_nextafter NULL
+#define binaryOperatorF NULL
+#define binaryF_i NULL
+#define macro_binaryF NULL
+#define ternaryF NULL
+#define unaryF_two_results NULL
+#define unaryF_two_results_i NULL
 #define binaryF_two_results_i NULL
-#define mad_function          NULL
+#define mad_function NULL
 
-#define reference_sqrt        NULL
-#define reference_sqrtl       NULL
-#define reference_divide      NULL
-#define reference_dividel     NULL
+#define reference_sqrt NULL
+#define reference_sqrtl NULL
+#define reference_divide NULL
+#define reference_dividel NULL
 #define reference_relaxed_divide NULL
 
 #else // FUNCTION_LIST_ULPS_ONLY
@@ -102,24 +102,27 @@
             _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
     }
 
-extern const vtbl _unary;               // float foo( float )
-extern const vtbl _unary_u;             // float foo( uint ),  double foo( ulong )
-extern const vtbl _i_unary;             // int foo( float )
-extern const vtbl _macro_unary;         // int foo( float ),  returns {0,1} for scalar, { 0, -1 } for vector
-extern const vtbl _binary;              // float foo( float, float )
-extern const vtbl _binary_nextafter;    // float foo( float, float ), special handling for nextafter
-extern const vtbl _binary_operator;     // float .op. float
-extern const vtbl _macro_binary;        // int foo( float, float ), returns {0,1} for scalar, { 0, -1 } for vector
-extern const vtbl _binary_i;            // float foo( float, int )
-extern const vtbl _ternary;             // float foo( float, float, float )
-extern const vtbl _unary_two_results;   // float foo( float, float * )
+extern const vtbl _unary; // float foo( float )
+extern const vtbl _unary_u; // float foo( uint ),  double foo( ulong )
+extern const vtbl _i_unary; // int foo( float )
+extern const vtbl _macro_unary; // int foo( float ),  returns {0,1} for scalar,
+                                // { 0, -1 } for vector
+extern const vtbl _binary; // float foo( float, float )
+extern const vtbl _binary_nextafter; // float foo( float, float ), special
+                                     // handling for nextafter
+extern const vtbl _binary_operator; // float .op. float
+extern const vtbl _macro_binary; // int foo( float, float ), returns {0,1} for
+                                 // scalar, { 0, -1 } for vector
+extern const vtbl _binary_i; // float foo( float, int )
+extern const vtbl _ternary; // float foo( float, float, float )
+extern const vtbl _unary_two_results; // float foo( float, float * )
 extern const vtbl _unary_two_results_i; // float foo( float, int * )
 extern const vtbl _binary_two_results_i; // float foo( float, float, int * )
-extern const vtbl _mad_tbl;             // float mad( float, float, float )
+extern const vtbl _mad_tbl; // float mad( float, float, float )
 
 #define unaryF &_unary
 #define i_unaryF &_i_unary
-#define unaryF_u  &_unary_u
+#define unaryF_u &_unary_u
 #define macro_unaryF &_macro_unary
 #define binaryF &_binary
 #define binaryF_nextafter &_binary_nextafter
@@ -127,10 +130,10 @@ extern const vtbl _mad_tbl;             // float mad( float, float, float )
 #define binaryF_i &_binary_i
 #define macro_binaryF &_macro_binary
 #define ternaryF &_ternary
-#define unaryF_two_results  &_unary_two_results
-#define unaryF_two_results_i  &_unary_two_results_i
-#define binaryF_two_results_i  &_binary_two_results_i
-#define mad_function        &_mad_tbl
+#define unaryF_two_results &_unary_two_results
+#define unaryF_two_results_i &_unary_two_results_i
+#define binaryF_two_results_i &_binary_two_results_i
+#define mad_function &_mad_tbl
 
 #endif // FUNCTION_LIST_ULPS_ONLY
 
@@ -325,4 +328,4 @@ const Func functionList[] = {
     OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
 };
 
-const size_t functionListCount = sizeof( functionList ) / sizeof( functionList[0] );
+const size_t functionListCount = sizeof(functionList) / sizeof(functionList[0]);
diff --git a/test_conformance/math_brute_force/FunctionList.h b/test_conformance/math_brute_force/FunctionList.h
index c22bceebba..e47eb72923 100644
--- a/test_conformance/math_brute_force/FunctionList.h
+++ b/test_conformance/math_brute_force/FunctionList.h
@@ -22,80 +22,77 @@
 #include <unistd.h>
 #endif
 
-#if defined( __APPLE__ )
-    #include <OpenCL/opencl.h>
+#if defined(__APPLE__)
+#include <OpenCL/opencl.h>
 #else
-    #include <CL/cl.h>
+#include <CL/cl.h>
 #endif
 
 #include "harness/mt19937.h"
 
-typedef union fptr
-{
-    void    *p;
-    double  (*f_f)(double);
-    double  (*f_u)(cl_uint);
-    int     (*i_f)(double);
-    int     (*i_f_f)(float);
-    float   (*f_ff_f)(float, float);
-    double  (*f_ff)(double, double);
-    int     (*i_ff)(double, double);
-    double  (*f_fi)(double, int);
-    double  (*f_fpf)(double, double*);
-    double  (*f_fpI)(double, int*);
-    double  (*f_ffpI)(double, double, int*);
-    double  (*f_fff)(double, double, double );
-    float   (*f_fma)(float, float, float, int);
-}fptr;
+typedef union fptr {
+    void *p;
+    double (*f_f)(double);
+    double (*f_u)(cl_uint);
+    int (*i_f)(double);
+    int (*i_f_f)(float);
+    float (*f_ff_f)(float, float);
+    double (*f_ff)(double, double);
+    int (*i_ff)(double, double);
+    double (*f_fi)(double, int);
+    double (*f_fpf)(double, double *);
+    double (*f_fpI)(double, int *);
+    double (*f_ffpI)(double, double, int *);
+    double (*f_fff)(double, double, double);
+    float (*f_fma)(float, float, float, int);
+} fptr;
 
-typedef union dptr
-{
-    void            *p;
-    long double     (*f_f)(long double);
-    long double     (*f_u)(cl_ulong);
-    int             (*i_f)(long double);
-    long double     (*f_ff)(long double, long double);
-    int             (*i_ff)(long double, long double);
-    long double     (*f_fi)(long double, int);
-    long double     (*f_fpf)(long double, long double*);
-    long double     (*f_fpI)(long double, int*);
-    long double     (*f_ffpI)(long double, long double, int*);
-    long double     (*f_fff)(long double, long double, long double);
-}dptr;
+typedef union dptr {
+    void *p;
+    long double (*f_f)(long double);
+    long double (*f_u)(cl_ulong);
+    int (*i_f)(long double);
+    long double (*f_ff)(long double, long double);
+    int (*i_ff)(long double, long double);
+    long double (*f_fi)(long double, int);
+    long double (*f_fpf)(long double, long double *);
+    long double (*f_fpI)(long double, int *);
+    long double (*f_ffpI)(long double, long double, int *);
+    long double (*f_fff)(long double, long double, long double);
+} dptr;
 
 struct Func;
 
 typedef struct vtbl
 {
-    const char  *type_name;
+    const char *type_name;
     int (*TestFunc)(const struct Func *, MTdata, bool);
     int (*DoubleTestFunc)(
         const struct Func *, MTdata,
         bool); // may be NULL if function is single precision only
-}vtbl;
+} vtbl;
 
 typedef struct Func
 {
-  const char      *name;              // common name, to be used as an argument in the shell
-  const char      *nameInCode;        // name as it appears in the __kernel, usually the same as name, but different for multiplication
-  fptr            func;
-  dptr            dfunc;
-  fptr            rfunc;
-  float           float_ulps;
-  float           double_ulps;
-  float           float_embedded_ulps;
-  float           relaxed_error;
-  float relaxed_embedded_error;
-  int             ftz;
-  int             relaxed;
-  const vtbl      *vtbl_ptr;
-}Func;
+    const char *name; // common name, to be used as an argument in the shell
+    const char *nameInCode; // name as it appears in the __kernel, usually the
+                            // same as name, but different for multiplication
+    fptr func;
+    dptr dfunc;
+    fptr rfunc;
+    float float_ulps;
+    float double_ulps;
+    float float_embedded_ulps;
+    float relaxed_error;
+    float relaxed_embedded_error;
+    int ftz;
+    int relaxed;
+    const vtbl *vtbl_ptr;
+} Func;
 
 
-extern const Func  functionList[];
+extern const Func functionList[];
 
 extern const size_t functionListCount;
 
 #endif
-
-
diff --git a/test_conformance/math_brute_force/Sleep.cpp b/test_conformance/math_brute_force/Sleep.cpp
index 4d3b2c64b5..7103779e41 100644
--- a/test_conformance/math_brute_force/Sleep.cpp
+++ b/test_conformance/math_brute_force/Sleep.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -16,103 +16,94 @@
 #include "Sleep.h"
 #include "Utility.h"
 
-#if defined( __APPLE__ )
-    #include <IOKit/pwr_mgt/IOPMLib.h>
-    #include <IOKit/IOMessage.h>
+#if defined(__APPLE__)
+#include <IOKit/pwr_mgt/IOPMLib.h>
+#include <IOKit/IOMessage.h>
 
-    struct
-    {
-        io_connect_t            connection;
-        IONotificationPortRef    port;
-        io_object_t                iterator;
-    }sleepInfo;
+struct
+{
+    io_connect_t connection;
+    IONotificationPortRef port;
+    io_object_t iterator;
+} sleepInfo;
 
-    void sleepCallback(    void *            refcon,
-                        io_service_t        service,
-                        natural_t        messageType,
-                        void *            messageArgument );
+void sleepCallback(void* refcon, io_service_t service, natural_t messageType,
+                   void* messageArgument);
 
-    void sleepCallback(    void *            refcon UNUSED,
-                        io_service_t        service UNUSED,
-                        natural_t        messageType,
-                        void *            messageArgument )
-    {
+void sleepCallback(void* refcon UNUSED, io_service_t service UNUSED,
+                   natural_t messageType, void* messageArgument)
+{
 
-        IOReturn result;
+    IOReturn result;
     /*
     service -- The IOService whose state has changed.
-    messageType -- A messageType enum, defined by IOKit/IOMessage.h or by the IOService's family.
-    messageArgument -- An argument for the message, dependent on the messageType.
+    messageType -- A messageType enum, defined by IOKit/IOMessage.h or by the
+        IOService's family.
+    messageArgument -- An argument for the message, dependent on messageType.
     */
-        switch ( messageType )
-        {
-            case kIOMessageSystemWillSleep:
-                // Handle demand sleep (such as sleep caused by running out of
-                // batteries, closing the lid of a laptop, or selecting
-                // sleep from the Apple menu.
-                IOAllowPowerChange(sleepInfo.connection,(long)messageArgument);
-                vlog( "Hard sleep occurred.\n" );
-                break;
-            case kIOMessageCanSystemSleep:
-                // In this case, the computer has been idle for several minutes
-                // and will sleep soon so you must either allow or cancel
-                // this notification. Important: if you don’t respond, there will
-                // be a 30-second timeout before the computer sleeps.
-                // IOCancelPowerChange(root_port,(long)messageArgument);
-                result = IOCancelPowerChange(sleepInfo.connection,(long)messageArgument);
-                if( kIOReturnSuccess != result )
-                    vlog( "sleep prevention failed. (%d)\n", result);
+    switch (messageType)
+    {
+        case kIOMessageSystemWillSleep:
+            // Handle demand sleep (such as sleep caused by running out of
+            // batteries, closing the lid of a laptop, or selecting
+            // sleep from the Apple menu.
+            IOAllowPowerChange(sleepInfo.connection, (long)messageArgument);
+            vlog("Hard sleep occurred.\n");
+            break;
+        case kIOMessageCanSystemSleep:
+            // In this case, the computer has been idle for several minutes
+            // and will sleep soon so you must either allow or cancel
+            // this notification. Important: if you don’t respond, there will
+            // be a 30-second timeout before the computer sleeps.
+            // IOCancelPowerChange(root_port,(long)messageArgument);
+            result = IOCancelPowerChange(sleepInfo.connection,
+                                         (long)messageArgument);
+            if (kIOReturnSuccess != result)
+                vlog("sleep prevention failed. (%d)\n", result);
+            break;
+        case kIOMessageSystemHasPoweredOn:
+            // Handle wakeup.
             break;
-            case kIOMessageSystemHasPoweredOn:
-                // Handle wakeup.
-                break;
-        }
     }
+}
 #endif
 
 
-
-
-
-void PreventSleep( void )
+void PreventSleep(void)
 {
-#if defined( __APPLE__ )
-    vlog( "Disabling sleep... " );
-    sleepInfo.iterator = (io_object_t) 0;
+#if defined(__APPLE__)
+    vlog("Disabling sleep... ");
+    sleepInfo.iterator = (io_object_t)0;
     sleepInfo.port = NULL;
-    sleepInfo.connection = IORegisterForSystemPower
-                            (
-                                &sleepInfo,                    //void * refcon,
-                                &sleepInfo.port,            //IONotificationPortRef * thePortRef,
-                                sleepCallback,                //IOServiceInterestCallback callback,
-                                &sleepInfo.iterator            //io_object_t * notifier
-                            );
-
-    if( (io_connect_t) 0 == sleepInfo.connection )
-        vlog( "failed.\n" );
+    sleepInfo.connection = IORegisterForSystemPower(
+        &sleepInfo, // void * refcon,
+        &sleepInfo.port, // IONotificationPortRef * thePortRef,
+        sleepCallback, // IOServiceInterestCallback callback,
+        &sleepInfo.iterator // io_object_t * notifier
+    );
+
+    if ((io_connect_t)0 == sleepInfo.connection)
+        vlog("failed.\n");
     else
-        vlog( "done.\n" );
+        vlog("done.\n");
 
     CFRunLoopAddSource(CFRunLoopGetCurrent(),
-                        IONotificationPortGetRunLoopSource(sleepInfo.port),
-                        kCFRunLoopDefaultMode);
+                       IONotificationPortGetRunLoopSource(sleepInfo.port),
+                       kCFRunLoopDefaultMode);
 #else
-    vlog( "*** PreventSleep() is not implemented on this platform.\n" );
+    vlog("*** PreventSleep() is not implemented on this platform.\n");
 #endif
 }
 
-void ResumeSleep( void )
+void ResumeSleep(void)
 {
-#if defined( __APPLE__ )
-    IOReturn result = IODeregisterForSystemPower ( &sleepInfo.iterator );
-    if( 0 != result )
-        vlog( "Got error %d restoring sleep \n", result );
+#if defined(__APPLE__)
+    IOReturn result = IODeregisterForSystemPower(&sleepInfo.iterator);
+    if (0 != result)
+        vlog("Got error %d restoring sleep \n", result);
     else
-        vlog( "Sleep restored.\n" );
+        vlog("Sleep restored.\n");
 #else
-    vlog( "*** ResumeSleep() is not implemented on this platform.\n" );
+    vlog("*** ResumeSleep() is not implemented on this platform.\n");
 #endif
 }
-
-
-
diff --git a/test_conformance/math_brute_force/Sleep.h b/test_conformance/math_brute_force/Sleep.h
index f983a32fd1..ca643954f4 100644
--- a/test_conformance/math_brute_force/Sleep.h
+++ b/test_conformance/math_brute_force/Sleep.h
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -16,9 +16,7 @@
 #ifndef SLEEP_H
 #define SLEEP_H
 
-void PreventSleep( void );
-void ResumeSleep( void );
+void PreventSleep(void);
+void ResumeSleep(void);
 
 #endif /* SLEEP_H */
-
-
diff --git a/test_conformance/math_brute_force/Utility.cpp b/test_conformance/math_brute_force/Utility.cpp
index 9ab7c7fa6b..3d8d9baab2 100644
--- a/test_conformance/math_brute_force/Utility.cpp
+++ b/test_conformance/math_brute_force/Utility.cpp
@@ -17,9 +17,9 @@
 #include "FunctionList.h"
 
 #if defined(__PPC__)
-// Global varaiable used to hold the FPU control register state. The FPSCR register can not
-// be used because not all Power implementations retain or observed the NI (non-IEEE
-// mode) bit.
+// Global variable used to hold the FPU control register state. The FPSCR
+// register cannot be used because not all Power implementations retain or
+// observe the NI (non-IEEE mode) bit.
 __thread fpu_control_t fpu_control = 0;
 #endif
 
@@ -28,16 +28,16 @@ void MulD(double *rhi, double *rlo, double u, double v)
     const double c = 134217729.0; // 1+2^27
     double up, u1, u2, vp, v1, v2;
 
-    up = u*c;
+    up = u * c;
     u1 = (u - up) + up;
     u2 = u - u1;
 
-    vp = v*c;
+    vp = v * c;
     v1 = (v - vp) + vp;
     v2 = v - v1;
 
-    double rh = u*v;
-    double rl = (((u1*v1 - rh) + (u1*v2)) + (u2*v1)) + (u2*v2);
+    double rh = u * v;
+    double rl = (((u1 * v1 - rh) + (u1 * v2)) + (u2 * v1)) + (u2 * v2);
 
     *rhi = rh;
     *rlo = rl;
@@ -47,11 +47,13 @@ void AddD(double *rhi, double *rlo, double a, double b)
 {
     double zhi, zlo;
     zhi = a + b;
-    if(fabs(a) > fabs(b)) {
+    if (fabs(a) > fabs(b))
+    {
         zlo = zhi - a;
         zlo = b - zlo;
     }
-    else {
+    else
+    {
         zlo = zhi - b;
         zlo = a - zlo;
     }
@@ -66,17 +68,17 @@ void MulDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl)
     double c = 134217729.0;
     double up, u1, u2, vp, v1, v2;
 
-    up = xh*c;
+    up = xh * c;
     u1 = (xh - up) + up;
     u2 = xh - u1;
 
-    vp = yh*c;
+    vp = yh * c;
     v1 = (yh - vp) + vp;
     v2 = yh - v1;
 
-    mh = xh*yh;
-    ml = (((u1*v1 - mh) + (u1*v2)) + (u2*v1)) + (u2*v2);
-    ml += xh*yl + xl*yh;
+    mh = xh * yh;
+    ml = (((u1 * v1 - mh) + (u1 * v2)) + (u2 * v1)) + (u2 * v2);
+    ml += xh * yl + xl * yh;
 
     *rhi = mh + ml;
     *rlo = (mh - (*rhi)) + ml;
@@ -86,7 +88,8 @@ void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl)
 {
     double r, s;
     r = xh + yh;
-    s = (fabs(xh) > fabs(yh)) ? (xh - r + yh + yl + xl) : (yh - r + xh + xl + yl);
+    s = (fabs(xh) > fabs(yh)) ? (xh - r + yh + yl + xl)
+                              : (yh - r + xh + xl + yl);
     *rhi = r + s;
     *rlo = (r - (*rhi)) + s;
 }
@@ -100,72 +103,61 @@ void DivideDD(double *chi, double *clo, double a, double b)
     *clo = rhi / b;
 }
 
-// These functions comapre two floats/doubles. Since some platforms may choose to
-// flush denormals to zeros before comparison, comparison like a < b may give wrong
-// result in "certain cases" where we do need correct compasion result when operands
-// are denormals .... these functions comapre floats/doubles using signed integer/long int
-// rep. In other cases, when flushing to zeros is fine, these should not be used.
-// Also these doesn't check for nans and assume nans are handled separately as special edge case
-// by the caller which calls these functions
-// return 0 if both are equal, 1 if x > y and -1 if x < y.
-
-inline
-int compareFloats(float x, float y)
+// These functions compare two floats/doubles. Since some platforms may choose
+// to flush denormals to zero before comparison, a comparison like a < b may
+// give the wrong result in cases where we need a correct comparison result
+// for denormal operands. These functions compare floats/doubles using their
+// signed integer/long int representation. When flushing to zero is fine,
+// these should not be used. They also do not check for NaNs and assume NaNs
+// are handled separately as a special edge case by the caller.
+// They return 0 if both are equal, 1 if x > y and -1 if x < y.
+
+inline int compareFloats(float x, float y)
 {
     int32f_t a, b;
 
     a.f = x;
     b.f = y;
 
-    if( a.i & 0x80000000 )
-        a.i = 0x80000000 - a.i;
-    if( b.i & 0x80000000 )
-        b.i = 0x80000000 - b.i;
+    if (a.i & 0x80000000) a.i = 0x80000000 - a.i;
+    if (b.i & 0x80000000) b.i = 0x80000000 - b.i;
 
-    if( a.i == b.i )
-        return 0;
+    if (a.i == b.i) return 0;
 
     return a.i < b.i ? -1 : 1;
 }
 
-inline
-int compareDoubles(double x, double y)
+inline int compareDoubles(double x, double y)
 {
     int64d_t a, b;
 
     a.d = x;
     b.d = y;
 
-    if( a.l & 0x8000000000000000LL )
-        a.l = 0x8000000000000000LL - a.l;
-    if( b.l & 0x8000000000000000LL )
-        b.l = 0x8000000000000000LL - b.l;
+    if (a.l & 0x8000000000000000LL) a.l = 0x8000000000000000LL - a.l;
+    if (b.l & 0x8000000000000000LL) b.l = 0x8000000000000000LL - b.l;
 
-    if( a.l == b.l )
-        return 0;
+    if (a.l == b.l) return 0;
 
     return a.l < b.l ? -1 : 1;
 }
 
-void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int isFastRelaxed)
+void logFunctionInfo(const char *fname, unsigned int float_size,
+                     unsigned int isFastRelaxed)
 {
     char const *fpSizeStr = NULL;
     char const *fpFastRelaxedStr = "";
-    switch (float_size) {
-    case sizeof(cl_double):
-        fpSizeStr = "fp64";
-        break;
-    case sizeof(cl_float):
-        fpSizeStr = "fp32";
-        break;
-    case sizeof(cl_half):
-        fpSizeStr = "fp16";
-        break;
+    switch (float_size)
+    {
+        case sizeof(cl_double): fpSizeStr = "fp64"; break;
+        case sizeof(cl_float): fpSizeStr = "fp32"; break;
+        case sizeof(cl_half): fpSizeStr = "fp16"; break;
     }
-    if (isFastRelaxed) {
+    if (isFastRelaxed)
+    {
         fpFastRelaxedStr = "rlx";
     }
-    vlog("%15s %4s %4s",fname, fpSizeStr, fpFastRelaxedStr);
+    vlog("%15s %4s %4s", fname, fpSizeStr, fpFastRelaxedStr);
 }
 
 float getAllowedUlpError(const Func *f, const bool relaxed)
diff --git a/test_conformance/math_brute_force/Utility.h b/test_conformance/math_brute_force/Utility.h
index 92f8f3dc94..dd3c5e5633 100644
--- a/test_conformance/math_brute_force/Utility.h
+++ b/test_conformance/math_brute_force/Utility.h
@@ -30,13 +30,13 @@
 #include "harness/ThreadPool.h"
 #include "harness/conversions.h"
 
-#define BUFFER_SIZE         (1024*1024*2)
+#define BUFFER_SIZE (1024 * 1024 * 2)
 #define EMBEDDED_REDUCTION_FACTOR (64)
 
-#if defined( __GNUC__ )
-    #define UNUSED  __attribute__ ((unused))
+#if defined(__GNUC__)
+#define UNUSED __attribute__((unused))
 #else
-    #define UNUSED
+#define UNUSED
 #endif
 
 struct Func;
@@ -44,62 +44,62 @@ struct Func;
 extern int gWimpyBufferSize;
 extern int gWimpyReductionFactor;
 
-#define VECTOR_SIZE_COUNT   6
+#define VECTOR_SIZE_COUNT 6
 extern const char *sizeNames[VECTOR_SIZE_COUNT];
-extern const int   sizeValues[VECTOR_SIZE_COUNT];
+extern const int sizeValues[VECTOR_SIZE_COUNT];
 
-extern cl_device_id     gDevice;
-extern cl_context       gContext;
+extern cl_device_id gDevice;
+extern cl_context gContext;
 extern cl_command_queue gQueue;
-extern void             *gIn;
-extern void             *gIn2;
-extern void             *gIn3;
-extern void             *gOut_Ref;
-extern void             *gOut_Ref2;
-extern void             *gOut[VECTOR_SIZE_COUNT];
-extern void             *gOut2[VECTOR_SIZE_COUNT];
-extern cl_mem           gInBuffer;
-extern cl_mem           gInBuffer2;
-extern cl_mem           gInBuffer3;
-extern cl_mem           gOutBuffer[VECTOR_SIZE_COUNT];
-extern cl_mem           gOutBuffer2[VECTOR_SIZE_COUNT];
-extern uint32_t         gComputeDevices;
-extern uint32_t         gSimdSize;
-extern int              gSkipCorrectnessTesting;
-extern int              gMeasureTimes;
-extern int              gReportAverageTimes;
-extern int              gForceFTZ;
-extern int              gFastRelaxedDerived;
-extern int              gWimpyMode;
-extern int              gHasDouble;
-extern int              gIsInRTZMode;
-extern int              gInfNanSupport;
-extern int              gIsEmbedded;
-extern int              gVerboseBruteForce;
-extern uint32_t         gMaxVectorSizeIndex;
-extern uint32_t         gMinVectorSizeIndex;
-extern uint32_t         gDeviceFrequency;
+extern void *gIn;
+extern void *gIn2;
+extern void *gIn3;
+extern void *gOut_Ref;
+extern void *gOut_Ref2;
+extern void *gOut[VECTOR_SIZE_COUNT];
+extern void *gOut2[VECTOR_SIZE_COUNT];
+extern cl_mem gInBuffer;
+extern cl_mem gInBuffer2;
+extern cl_mem gInBuffer3;
+extern cl_mem gOutBuffer[VECTOR_SIZE_COUNT];
+extern cl_mem gOutBuffer2[VECTOR_SIZE_COUNT];
+extern uint32_t gComputeDevices;
+extern uint32_t gSimdSize;
+extern int gSkipCorrectnessTesting;
+extern int gMeasureTimes;
+extern int gReportAverageTimes;
+extern int gForceFTZ;
+extern int gFastRelaxedDerived;
+extern int gWimpyMode;
+extern int gHasDouble;
+extern int gIsInRTZMode;
+extern int gInfNanSupport;
+extern int gIsEmbedded;
+extern int gVerboseBruteForce;
+extern uint32_t gMaxVectorSizeIndex;
+extern uint32_t gMinVectorSizeIndex;
+extern uint32_t gDeviceFrequency;
 extern cl_device_fp_config gFloatCapabilities;
 extern cl_device_fp_config gDoubleCapabilities;
 
-#define LOWER_IS_BETTER     0
-#define HIGHER_IS_BETTER    1
+#define LOWER_IS_BETTER 0
+#define HIGHER_IS_BETTER 1
 
 #include "harness/errorHelpers.h"
 
-#if defined (_MSC_VER )
-    //Deal with missing scalbn on windows
-    #define scalbnf( _a, _i )       ldexpf( _a, _i )
-    #define scalbn( _a, _i )        ldexp( _a, _i )
-    #define scalbnl( _a, _i )       ldexpl( _a, _i )
+#if defined(_MSC_VER)
+// Deal with missing scalbn on windows
+#define scalbnf(_a, _i) ldexpf(_a, _i)
+#define scalbn(_a, _i) ldexp(_a, _i)
+#define scalbnl(_a, _i) ldexpl(_a, _i)
 #endif
 
-float Abs_Error( float test, double reference );
-float Ulp_Error( float test, double reference );
-float Bruteforce_Ulp_Error_Double( double test, long double reference );
+float Abs_Error(float test, double reference);
+float Ulp_Error(float test, double reference);
+float Bruteforce_Ulp_Error_Double(double test, long double reference);
 
-uint64_t GetTime( void );
-double SubtractTime( uint64_t endTime, uint64_t startTime );
+uint64_t GetTime(void);
+double SubtractTime(uint64_t endTime, uint64_t startTime);
 int MakeKernel(const char **c, cl_uint count, const char *name, cl_kernel *k,
                cl_program *p, bool relaxedMode);
 int MakeKernels(const char **c, cl_uint count, const char *name,
@@ -107,69 +107,84 @@ int MakeKernels(const char **c, cl_uint count, const char *name,
                 bool relaxedMode);
 
 // used to convert a bucket of bits into a search pattern through double
-static inline double DoubleFromUInt32( uint32_t bits );
-static inline double DoubleFromUInt32( uint32_t bits )
+static inline double DoubleFromUInt32(uint32_t bits);
+static inline double DoubleFromUInt32(uint32_t bits)
 {
-    union{ uint64_t u; double d;} u;
+    union {
+        uint64_t u;
+        double d;
+    } u;
 
     // split 0x89abcdef to 0x89abc00000000def
     u.u = bits & 0xfffU;
-    u.u |= (uint64_t) (bits & ~0xfffU) << 32;
+    u.u |= (uint64_t)(bits & ~0xfffU) << 32;
 
-    // sign extend the leading bit of def segment as sign bit so that the middle region consists of either all 1s or 0s
+    // sign extend the leading bit of def segment as sign bit so that the middle
+    // region consists of either all 1s or 0s
     u.u -= (bits & 0x800U) << 1;
 
     // return result
     return u.d;
 }
 
-void _LogBuildError( cl_program p, int line, const char *file );
-#define LogBuildError( program )        _LogBuildError( program, __LINE__, __FILE__ )
+void _LogBuildError(cl_program p, int line, const char *file);
+#define LogBuildError(program) _LogBuildError(program, __LINE__, __FILE__)
 
 #define PERF_LOOP_COUNT 100
 
-//The spec is fairly clear that we may enforce a hard cutoff to prevent premature flushing to zero.
-// However, to avoid conflict for 1.0, we are letting results at TYPE_MIN + ulp_limit to be flushed to zero.
-static inline int IsFloatResultSubnormal( double x, float ulps )
+// The spec is fairly clear that we may enforce a hard cutoff to prevent
+// premature flushing to zero.
+// However, to avoid conflict for 1.0, we are letting results at TYPE_MIN +
+// ulp_limit to be flushed to zero.
+static inline int IsFloatResultSubnormal(double x, float ulps)
 {
-    x = fabs(x) - MAKE_HEX_DOUBLE( 0x1.0p-149, 0x1, -149) * (double) ulps;
-    return x < MAKE_HEX_DOUBLE( 0x1.0p-126, 0x1, -126 );
+    x = fabs(x) - MAKE_HEX_DOUBLE(0x1.0p-149, 0x1, -149) * (double)ulps;
+    return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126);
 }
 
-static inline int IsFloatResultSubnormalAbsError( double x , float abs_err)
+static inline int IsFloatResultSubnormalAbsError(double x, float abs_err)
 {
-  x = x - abs_err;
-  return x < MAKE_HEX_DOUBLE( 0x1.0p-126, 0x1, -126 );
+    x = x - abs_err;
+    return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126);
 }
 
-static inline int IsDoubleResultSubnormal( long double x, float ulps )
+static inline int IsDoubleResultSubnormal(long double x, float ulps)
 {
-    x = fabsl(x) - MAKE_HEX_LONG( 0x1.0p-1074, 0x1, -1074) * (long double) ulps;
-    return x < MAKE_HEX_LONG( 0x1.0p-1022, 0x1, -1022 );
+    x = fabsl(x) - MAKE_HEX_LONG(0x1.0p-1074, 0x1, -1074) * (long double)ulps;
+    return x < MAKE_HEX_LONG(0x1.0p-1022, 0x1, -1022);
 }
 
 static inline int IsFloatInfinity(double x)
 {
-  union { cl_float d; cl_uint u; } u;
-  u.d = (cl_float) x;
-  return ((u.u & 0x7fffffffU) == 0x7F800000U);
+    union {
+        cl_float d;
+        cl_uint u;
+    } u;
+    u.d = (cl_float)x;
+    return ((u.u & 0x7fffffffU) == 0x7F800000U);
 }
 
 static inline int IsFloatMaxFloat(double x)
 {
-  union { cl_float d; cl_uint u; } u;
-  u.d = (cl_float) x;
-  return ((u.u & 0x7fffffffU) == 0x7F7FFFFFU);
+    union {
+        cl_float d;
+        cl_uint u;
+    } u;
+    u.d = (cl_float)x;
+    return ((u.u & 0x7fffffffU) == 0x7F7FFFFFU);
 }
 
 static inline int IsFloatNaN(double x)
 {
-  union { cl_float d; cl_uint u; } u;
-  u.d = (cl_float) x;
-  return ((u.u & 0x7fffffffU) > 0x7F800000U);
+    union {
+        cl_float d;
+        cl_uint u;
+    } u;
+    u.d = (cl_float)x;
+    return ((u.u & 0x7fffffffU) > 0x7F800000U);
 }
 
-extern cl_uint RoundUpToNextPowerOfTwo( cl_uint x );
+extern cl_uint RoundUpToNextPowerOfTwo(cl_uint x);
 
 // Windows (since long double got deprecated) sets the x87 to 53-bit precision
 // (that's x87 default state).  This causes problems with the tests that
@@ -186,46 +201,50 @@ static inline void Force64BitFPUPrecision(void)
     // divergent code just use inline assembly which works for both.
     unsigned short int orig_cw = 0;
     unsigned short int new_cw = 0;
-    __asm__ __volatile__ ("fstcw %0":"=m" (orig_cw));
-    new_cw = orig_cw | 0x0300;   // set precision to 64-bit
-    __asm__ __volatile__ ("fldcw  %0"::"m" (new_cw));
-#elif defined( _WIN32 ) && defined( __INTEL_COMPILER )
-    // Unfortunately, usual method (`_controlfp( _PC_64, _MCW_PC );') does *not* work on win.x64:
-    // > On the x64 architecture, changing the floating point precision is not supported.
-    // (Taken from http://msdn.microsoft.com/en-us/library/e9b52ceh%28v=vs.100%29.aspx)
+    __asm__ __volatile__("fstcw %0" : "=m"(orig_cw));
+    new_cw = orig_cw | 0x0300; // set precision to 64-bit
+    __asm__ __volatile__("fldcw  %0" ::"m"(new_cw));
+#elif defined(_WIN32) && defined(__INTEL_COMPILER)
+    // Unfortunately, usual method (`_controlfp( _PC_64, _MCW_PC );') does *not*
+    // work on win.x64: > On the x64 architecture, changing the floating point
+    // precision is not supported. (Taken from
+    // http://msdn.microsoft.com/en-us/library/e9b52ceh%28v=vs.100%29.aspx)
     int cw;
-    __asm { fnstcw cw };    // Get current value of FPU control word.
-    cw = cw & 0xfffffcff | ( 3 << 8 ); // Set Precision Control to Double Extended Precision.
-    __asm { fldcw cw };     // Set new value of FPU control word.
+    __asm { fnstcw cw }
+    ; // Get current value of FPU control word.
+    cw = cw & 0xfffffcff
+        | (3 << 8); // Set Precision Control to Double Extended Precision.
+    __asm { fldcw cw }
+    ; // Set new value of FPU control word.
 #else
     /* Implement for other platforms if needed */
 #endif
 }
 
-extern
-void memset_pattern4(void *dest, const void *src_pattern, size_t bytes );
+extern void memset_pattern4(void *dest, const void *src_pattern, size_t bytes);
 
-typedef union
-{
+typedef union {
     int32_t i;
-    float   f;
-}int32f_t;
+    float f;
+} int32f_t;
 
-typedef union
-{
+typedef union {
     int64_t l;
-    double  d;
-}int64d_t;
+    double d;
+} int64d_t;
 
 void MulD(double *rhi, double *rlo, double u, double v);
 void AddD(double *rhi, double *rlo, double a, double b);
-void MulDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl);
-void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl);
+void MulDD(double *rhi, double *rlo, double xh, double xl, double yh,
+           double yl);
+void AddDD(double *rhi, double *rlo, double xh, double xl, double yh,
+           double yl);
 void DivideDD(double *chi, double *clo, double a, double b);
 int compareFloats(float x, float y);
 int compareDoubles(double x, double y);
 
-void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int isFastRelaxed);
+void logFunctionInfo(const char *fname, unsigned int float_size,
+                     unsigned int isFastRelaxed);
 
 float getAllowedUlpError(const Func *f, const bool relaxed);
 
diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary.cpp
index 0b8be27b6a..db961c8d77 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -46,63 +46,82 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {     "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2 )\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i] );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global float* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-                            "       f0 = ", name, "( f0, f1 );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 f0, f1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( f0, f1 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in, __global float* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0, f1 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0, f1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, f1 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -112,65 +131,84 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
                              bool relaxedMode)
 {
-    const char *c[] = {     "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2 )\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i] );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in, __global double* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       double3 d0 = vload3( 0, in + 3 * i );\n"
-                            "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-                            "       d0 = ", name, "( d0, d1 );\n"
-                            "       vstore3( d0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       double3 d0, d1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       d0 = ", name, "( d0, d1 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = d0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = d0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       d0 = ",
+        name,
+        "( d0, d1 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0, d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, d1 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -178,115 +216,215 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
-    -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),  MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), -1000.f, -100.f,  -4.0f, -3.5f,
-    -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f, MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),  MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f, MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f,
-
-    +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), +1000.f, +100.f, +4.0f, +3.5f,
-    +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f
+    -NAN,
+    -INFINITY,
+    -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
+    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
+    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
+    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
+    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
+    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
+    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
+    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
+    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
+    -1000.f,
+    -100.f,
+    -4.0f,
+    -3.5f,
+    -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
+    -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
+    -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
+    -1.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
+    -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
+    -0.5f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
+    -0.25f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
+    -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
+    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
+    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
+    -0.0f,
+
+    +NAN,
+    +INFINITY,
+    +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
+    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
+    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
+    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
+    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
+    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
+    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
+    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
+    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
+    +1000.f,
+    +100.f,
+    +4.0f,
+    +3.5f,
+    +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
+    2.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
+    +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
+    1.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
+    +1.0f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
+    +0.5f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
+    +0.25f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
+    +FLT_MIN,
+    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
+    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
+    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
+    +0.0f
 };
 
-static size_t specialValuesFloatCount = sizeof( specialValuesFloat ) / sizeof( specialValuesFloat[0] );
+static size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_uint     kernel_count;
-    cl_kernel   **kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count;
+    cl_kernel **kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
                              info->kernels[i], info->programs + i,
                              info->relaxedMode);
 }
 
-//Thread specific data for a worker thread
+// Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
-    cl_mem      inBuf;                              // input buffer for the thread
-    cl_mem      inBuf2;                             // input buffer for the thread
-    cl_mem      outBuf[ VECTOR_SIZE_COUNT ];        // output buffers for the thread
-    float       maxError;                           // max error value. Init to 0.
-    double      maxErrorValue;                      // position of the max error value (param 1).  Init to 0.
-    double      maxErrorValue2;                     // position of the max error value (param 2).  Init to 0.
-    MTdata      d;
-    cl_command_queue tQueue;                        // per thread command queue to improve performance
-}ThreadInfo;
+    cl_mem inBuf; // input buffer for the thread
+    cl_mem inBuf2; // input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    double maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdata d;
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
 
 typedef struct TestInfo
 {
-    size_t      subBufferSize;                      // Size of the sub-buffer in elements
-    const Func  *f;                                 // A pointer to the function info
-    cl_program  programs[ VECTOR_SIZE_COUNT ];      // programs for various vector sizes
-    cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
-    ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
-    cl_uint     threadCount;                        // Number of worker threads
-    cl_uint     jobCount;                           // Number of jobs
-    cl_uint     step;                               // step between each chunk and the next.
-    cl_uint     scale;                              // stride between individual test values
-    float       ulps;                               // max_allowed ulps
-    int         ftz;                                // non-zero if running in flush to zero mode
-
-    int         isFDim;
-    int         skipNanInf;
-    int         isNextafter;
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    int isFDim;
+    int skipNanInf;
+    int isNextafter;
     bool relaxedMode; // True if test is running in relaxed mode, false
                       // otherwise.
 } TestInfo;
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter,
                                       bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
-    double      maxErrorVal2 = 0.0;
-    int         skipTestingRelaxed = 0;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+    int skipTestingRelaxed = 0;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
 
-    if (gWimpyMode){
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -296,62 +434,83 @@ int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter,
 
     test_info.f = f;
     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
 
-    test_info.isFDim = 0 == strcmp( "fdim", f->nameInCode );
-    test_info.skipNanInf = test_info.isFDim  && ! gInfNanSupport;
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
+    test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
     test_info.isNextafter = isNextafter;
     test_info.relaxedMode = relaxedMode;
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf2 )
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer2 for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gOutBuffer[%d] for region {%zd, %zd}\n", (int) j, region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
 
@@ -364,19 +523,21 @@ int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter,
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
     // Run the kernels
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            if( test_info.tinfo[i].maxError > maxError )
+            if (test_info.tinfo[i].maxError > maxError)
             {
                 maxError = test_info.tinfo[i].maxError;
                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
@@ -384,176 +545,200 @@ int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter,
             }
         }
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = (genrand_int32(d) & ~0x40000000) | 0x20000000;
             p2[j] = 0x3fc00000;
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;    // BUFFER_SIZE / vectorSize  rounded up
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
 
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            free_mtdata( test_info.tinfo[i].d );
+            free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_float );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    fptr        func = job->f->func;
-    int         ftz = job->ftz;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
     bool relaxedMode = job->relaxedMode;
     float ulps = getAllowedUlpError(job->f, relaxedMode);
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    cl_uchar    *overflow = (cl_uchar*)malloc(buffer_size);
-    const char  *name = job->f->name;
-    int         isFDim = job->isFDim;
-    int         skipNanInf = job->skipNanInf;
-    int         isNextafter = job->isNextafter;
-    cl_uint     *t = 0;
-    float       *r=0,*s=0,*s2=0;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
+    const char *name = job->f->name;
+    int isFDim = job->isFDim;
+    int skipNanInf = job->skipNanInf;
+    int isNextafter = job->isNextafter;
+    cl_uint *t = 0;
+    float *r = 0, *s = 0, *s2 = 0;
     cl_int copysign_test = 0;
     RoundingMode oldRoundMode;
     int skipVerification = 0;
 
     if (relaxedMode)
     {
-      if (strcmp(name,"pow")==0 && gFastRelaxedDerived)
-      {
-        func = job->f->rfunc;
-        ulps = INFINITY;
-        skipVerification = 1;
-      }else
-      {
-        func = job->f->rfunc;
-      }
+        if (strcmp(name, "pow") == 0 && gFastRelaxedDerived)
+        {
+            func = job->f->rfunc;
+            ulps = INFINITY;
+            skipVerification = 1;
+        }
+        else
+        {
+            func = job->f->rfunc;
+        }
     }
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_uint  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_uint *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_uint*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
 
-    int totalSpecialValueCount = specialValuesFloatCount * specialValuesFloatCount;
+    int totalSpecialValueCount =
+        specialValuesFloatCount * specialValuesFloatCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
     if (job_id <= (cl_uint)indx)
@@ -562,91 +747,111 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
         float *fp2 = (float *)p2;
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesFloatCount;
-    y = (job_id * buffer_elements) / specialValuesFloatCount;
+        x = (job_id * buffer_elements) % specialValuesFloatCount;
+        y = (job_id * buffer_elements) / specialValuesFloatCount;
 
-        for( ; j < buffer_elements; j++ )
+        for (; j < buffer_elements; j++)
         {
             fp[j] = specialValuesFloat[x];
             fp2[j] = specialValuesFloat[y];
-            if( ++x >= specialValuesFloatCount )
+            if (++x >= specialValuesFloatCount)
             {
                 x = 0;
                 y++;
-                if( y >= specialValuesFloatCount )
-                    break;
+                if (y >= specialValuesFloatCount) break;
             }
         }
     }
 
-    //Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
     {
         p[j] = genrand_int32(d);
         p2[j] = genrand_int32(d);
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
 
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
+    if (gSkipCorrectnessTesting)
     {
-        if( (error = clFinish(tinfo->tQueue)) )
+        if ((error = clFinish(tinfo->tQueue)))
         {
-          vlog_error( "Error: clFinish failed! err: %d\n", error );
-          goto exit;
+            vlog_error("Error: clFinish failed! err: %d\n", error);
+            goto exit;
         }
         free(overflow);
         return CL_SUCCESS;
@@ -654,105 +859,111 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
 
     FPU_mode_type oldMode;
     oldRoundMode = kRoundToNearestEven;
-    if( isFDim )
+    if (isFDim)
     {
-        //Calculate the correctly rounded reference result
-        memset( &oldMode, 0, sizeof( oldMode ) );
-        if( ftz )
-            ForceFTZ( &oldMode );
+        // Calculate the correctly rounded reference result
+        memset(&oldMode, 0, sizeof(oldMode));
+        if (ftz) ForceFTZ(&oldMode);
 
         // Set the rounding mode to match the device
-        if (gIsInRTZMode)
-            oldRoundMode = set_round(kRoundTowardZero, kfloat);
+        if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
     }
 
-    if(!strcmp(name, "copysign"))
-        copysign_test = 1;
+    if (!strcmp(name, "copysign")) copysign_test = 1;
 
-#define ref_func(s, s2) (copysign_test ? func.f_ff_f( s, s2 ) : func.f_ff( s, s2 ))
+#define ref_func(s, s2) (copysign_test ? func.f_ff_f(s, s2) : func.f_ff(s, s2))
 
-    //Calculate the correctly rounded reference result
-    r = (float *)gOut_Ref  + thread_id * buffer_elements;
-    s = (float *)gIn  + thread_id * buffer_elements;
-    s2 = (float *)gIn2  + thread_id * buffer_elements;
-    if( skipNanInf )
+    // Calculate the correctly rounded reference result
+    r = (float *)gOut_Ref + thread_id * buffer_elements;
+    s = (float *)gIn + thread_id * buffer_elements;
+    s2 = (float *)gIn2 + thread_id * buffer_elements;
+    if (skipNanInf)
     {
-        for( j = 0; j < buffer_elements; j++ )
+        for (j = 0; j < buffer_elements; j++)
         {
             feclearexcept(FE_OVERFLOW);
-            r[j] = (float) ref_func( s[j], s2[j] );
-            overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+            r[j] = (float)ref_func(s[j], s2[j]);
+            overflow[j] =
+                FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
         }
     }
     else
     {
-        for( j = 0; j < buffer_elements; j++ )
-            r[j] = (float) ref_func( s[j], s2[j] );
+        for (j = 0; j < buffer_elements; j++)
+            r[j] = (float)ref_func(s[j], s2[j]);
     }
 
-    if( isFDim && ftz )
-        RestoreFPState( &oldMode );
+    if (isFDim && ftz) RestoreFPState(&oldMode);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_uint*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (cl_uint*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                           0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    if (!skipVerification) {
-        //Verify data
+    if (!skipVerification)
+    {
+        // Verify data
         t = (cl_uint *)r;
-        for( j = 0; j < buffer_elements; j++ )
+        for (j = 0; j < buffer_elements; j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 cl_uint *q = out[k];
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+                if (t[j] != q[j])
                 {
-                    float test = ((float*) q)[j];
-                    double correct = ref_func( s[j], s2[j] );
-
-                    // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                    // As per OpenCL 2.0 spec, section 5.8.4.3, enabling fast-relaxed-math mode also enables
-                    // -cl-finite-math-only optimization. This optimization allows to assume that arguments and
-                    // results are not NaNs or +/-INFs. Hence, accept any result if inputs or results are NaNs or INFs.
+                    float test = ((float *)q)[j];
+                    double correct = ref_func(s[j], s2[j]);
+
+                    // Per section 10 paragraph 6, accept any result if an input
+                    // or output is an infinity or NaN or overflow. As per
+                    // OpenCL 2.0 spec, section 5.8.4.3, enabling
+                    // fast-relaxed-math mode also enables -cl-finite-math-only
+                    // optimization. This optimization allows assuming that
+                    // arguments and results are not NaNs or +/-INFs. Hence,
+                    // accept any result if inputs or results are NaNs or INFs.
                     if (relaxedMode || skipNanInf)
                     {
-                        if( skipNanInf && overflow[j])
-                            continue;
-                        // Note: no double rounding here.  Reference functions calculate in single precision.
-                        if( IsFloatInfinity(correct) || IsFloatNaN(correct)     ||
-                            IsFloatInfinity(s2[j])   || IsFloatNaN(s2[j])       ||
-                            IsFloatInfinity(s[j])    || IsFloatNaN(s[j])        )
+                        if (skipNanInf && overflow[j]) continue;
+                        // Note: no double rounding here.  Reference functions
+                        // calculate in single precision.
+                        if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                            || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j])
+                            || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
                             continue;
                     }
 
-                    float err = Ulp_Error( test, correct );
-                    int fail = ! (fabsf(err) <= ulps);
+                    float err = Ulp_Error(test, correct);
+                    int fail = !(fabsf(err) <= ulps);
 
-                    if( fail && ftz )
+                    if (fail && ftz)
                     {
                         // retry per section 6.5.3.2
-                        if( IsFloatResultSubnormal(correct, ulps ) )
+                        if (IsFloatResultSubnormal(correct, ulps))
                         {
-                            fail = fail && ( test != 0.0f );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
 
                         // nextafter on FTZ platforms may return the smallest
@@ -765,171 +976,203 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
                         // normal number is the next representable number.
                         // In which case, it should have the same sign as the
                         // second argument.
-                        if (isNextafter )
+                        if (isNextafter)
                         {
-                            if(IsFloatSubnormal(s[j]) || s[j] == 0.0f)
+                            if (IsFloatSubnormal(s[j]) || s[j] == 0.0f)
                             {
                                 float value = copysignf(twoToMinus126, s2[j]);
                                 fail = fail && (test != value);
-                                if (!fail)
-                                    err = 0.0f;
+                                if (!fail) err = 0.0f;
                             }
                         }
                         else
                         {
                             // retry per section 6.5.3.3
-                            if( IsFloatSubnormal( s[j] ) )
+                            if (IsFloatSubnormal(s[j]))
                             {
                                 double correct2, correct3;
                                 float err2, err3;
 
-                                if( skipNanInf )
-                                    feclearexcept(FE_OVERFLOW);
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
 
-                                correct2 = ref_func( 0.0, s2[j] );
-                                correct3 = ref_func( -0.0, s2[j] );
+                                correct2 = ref_func(0.0, s2[j]);
+                                correct3 = ref_func(-0.0, s2[j]);
 
-                                // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                                // As per OpenCL 2.0 spec, section 5.8.4.3, enabling fast-relaxed-math mode also enables
-                                // -cl-finite-math-only optimization. This optimization allows to assume that arguments and
-                                // results are not NaNs or +/-INFs. Hence, accept any result if inputs or results are NaNs or INFs.
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is an infinity or NaN,
+                                // or on overflow. As per OpenCL 2.0 spec,
+                                // section 5.8.4.3, enabling fast-relaxed-math
+                                // mode also enables the -cl-finite-math-only
+                                // optimization, letting the compiler
+                                // assume that arguments and results are not
+                                // NaNs or +/-INFs. Hence, accept any result if
+                                // inputs or results are NaNs or INFs.
                                 if (relaxedMode || skipNanInf)
                                 {
-                                    if( fetestexcept(FE_OVERFLOW) && skipNanInf )
+                                    if (fetestexcept(FE_OVERFLOW) && skipNanInf)
                                         continue;
 
-                                    // Note: no double rounding here.  Reference functions calculate in single precision.
-                                    if( IsFloatInfinity(correct2) || IsFloatNaN(correct2)   ||
-                                        IsFloatInfinity(correct3) || IsFloatNaN(correct3)    )
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (IsFloatInfinity(correct2)
+                                        || IsFloatNaN(correct2)
+                                        || IsFloatInfinity(correct3)
+                                        || IsFloatNaN(correct3))
                                         continue;
                                 }
 
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
+                                err2 = Ulp_Error(test, correct2);
+                                err3 = Ulp_Error(test, correct3);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= ulps))
+                                        && (!(fabsf(err3) <= ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
 
                                 // retry per section 6.5.3.4
-                                if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) )
+                                if (IsFloatResultSubnormal(correct2, ulps)
+                                    || IsFloatResultSubnormal(correct3, ulps))
                                 {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
                                 }
 
-                                //try with both args as zero
-                                if( IsFloatSubnormal( s2[j] )  )
+                                // try with both args as zero
+                                if (IsFloatSubnormal(s2[j]))
                                 {
                                     double correct4, correct5;
                                     float err4, err5;
 
-                                    if( skipNanInf )
-                                        feclearexcept(FE_OVERFLOW);
-
-                                    correct2 = ref_func( 0.0, 0.0 );
-                                    correct3 = ref_func( -0.0, 0.0 );
-                                    correct4 = ref_func( 0.0, -0.0 );
-                                    correct5 = ref_func( -0.0, -0.0 );
-
-                                    // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                                    // As per OpenCL 2.0 spec, section 5.8.4.3, enabling fast-relaxed-math mode also enables
-                                    // -cl-finite-math-only optimization. This optimization allows to assume that arguments and
-                                    // results are not NaNs or +/-INFs. Hence, accept any result if inputs or results are NaNs or INFs.
+                                    if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                                    correct2 = ref_func(0.0, 0.0);
+                                    correct3 = ref_func(-0.0, 0.0);
+                                    correct4 = ref_func(0.0, -0.0);
+                                    correct5 = ref_func(-0.0, -0.0);
+
+                                    // Per section 10 paragraph 6, accept any
+                                    // result if an input or output is an
+                                    // infinity or NaN, or on overflow. As per
+                                    // OpenCL 2.0 spec, section 5.8.4.3,
+                                    // enabling fast-relaxed-math mode also
+                                    // enables the -cl-finite-math-only
+                                    // optimization, letting the compiler
+                                    // assume that arguments and results are not
+                                    // NaNs or +/-INFs. Hence, accept any result
+                                    // if inputs or results are NaNs or INFs.
                                     if (relaxedMode || skipNanInf)
                                     {
-                                        if( fetestexcept(FE_OVERFLOW) && skipNanInf )
+                                        if (fetestexcept(FE_OVERFLOW)
+                                            && skipNanInf)
                                             continue;
 
-                                        // Note: no double rounding here.  Reference functions calculate in single precision.
-                                        if( IsFloatInfinity(correct2) || IsFloatNaN(correct2)   ||
-                                            IsFloatInfinity(correct3) || IsFloatNaN(correct3)   ||
-                                            IsFloatInfinity(correct4) || IsFloatNaN(correct4)   ||
-                                            IsFloatInfinity(correct5) || IsFloatNaN(correct5)    )
+                                        // Note: no double rounding here.
+                                        // Reference functions calculate in
+                                        // single precision.
+                                        if (IsFloatInfinity(correct2)
+                                            || IsFloatNaN(correct2)
+                                            || IsFloatInfinity(correct3)
+                                            || IsFloatNaN(correct3)
+                                            || IsFloatInfinity(correct4)
+                                            || IsFloatNaN(correct4)
+                                            || IsFloatInfinity(correct5)
+                                            || IsFloatNaN(correct5))
                                             continue;
                                     }
 
-                                    err2 = Ulp_Error( test, correct2  );
-                                    err3 = Ulp_Error( test, correct3  );
-                                    err4 = Ulp_Error( test, correct4  );
-                                    err5 = Ulp_Error( test, correct5  );
-                                    fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)) &&
-                                                     (!(fabsf(err4) <= ulps)) && (!(fabsf(err5) <= ulps)));
-                                    if( fabsf( err2 ) < fabsf(err ) )
-                                        err = err2;
-                                    if( fabsf( err3 ) < fabsf(err ) )
-                                        err = err3;
-                                    if( fabsf( err4 ) < fabsf(err ) )
-                                        err = err4;
-                                    if( fabsf( err5 ) < fabsf(err ) )
-                                        err = err5;
+                                    err2 = Ulp_Error(test, correct2);
+                                    err3 = Ulp_Error(test, correct3);
+                                    err4 = Ulp_Error(test, correct4);
+                                    err5 = Ulp_Error(test, correct5);
+                                    fail = fail
+                                        && ((!(fabsf(err2) <= ulps))
+                                            && (!(fabsf(err3) <= ulps))
+                                            && (!(fabsf(err4) <= ulps))
+                                            && (!(fabsf(err5) <= ulps)));
+                                    if (fabsf(err2) < fabsf(err)) err = err2;
+                                    if (fabsf(err3) < fabsf(err)) err = err3;
+                                    if (fabsf(err4) < fabsf(err)) err = err4;
+                                    if (fabsf(err5) < fabsf(err)) err = err5;
 
                                     // retry per section 6.5.3.4
-                                    if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) ||
-                                        IsFloatResultSubnormal( correct4, ulps ) || IsFloatResultSubnormal( correct5, ulps ) )
+                                    if (IsFloatResultSubnormal(correct2, ulps)
+                                        || IsFloatResultSubnormal(correct3,
+                                                                  ulps)
+                                        || IsFloatResultSubnormal(correct4,
+                                                                  ulps)
+                                        || IsFloatResultSubnormal(correct5,
+                                                                  ulps))
                                     {
-                                        fail = fail && ( test != 0.0f);
-                                        if( ! fail )
-                                            err = 0.0f;
+                                        fail = fail && (test != 0.0f);
+                                        if (!fail) err = 0.0f;
                                     }
                                 }
                             }
-                            else if(IsFloatSubnormal(s2[j]) )
+                            else if (IsFloatSubnormal(s2[j]))
                             {
                                 double correct2, correct3;
                                 float err2, err3;
 
-                                if( skipNanInf )
-                                    feclearexcept(FE_OVERFLOW);
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
 
-                                correct2 = ref_func( s[j], 0.0 );
-                                correct3 = ref_func( s[j], -0.0 );
+                                correct2 = ref_func(s[j], 0.0);
+                                correct3 = ref_func(s[j], -0.0);
 
-                                // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                                // As per OpenCL 2.0 spec, section 5.8.4.3, enabling fast-relaxed-math mode also enables
-                                // -cl-finite-math-only optimization. This optimization allows to assume that arguments and
-                                // results are not NaNs or +/-INFs. Hence, accept any result if inputs or results are NaNs or INFs.
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is an infinity or NaN,
+                                // or on overflow. As per OpenCL 2.0 spec,
+                                // section 5.8.4.3, enabling fast-relaxed-math
+                                // mode also enables the -cl-finite-math-only
+                                // optimization, letting the compiler
+                                // assume that arguments and results are not
+                                // NaNs or +/-INFs. Hence, accept any result if
+                                // inputs or results are NaNs or INFs.
                                 if (relaxedMode || skipNanInf)
                                 {
-                                    // Note: no double rounding here.  Reference functions calculate in single precision.
-                                    if( overflow[j] && skipNanInf)
-                                        continue;
-
-                                    if( IsFloatInfinity(correct2) || IsFloatNaN(correct2)   ||
-                                        IsFloatInfinity(correct3) || IsFloatNaN(correct3)    )
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (overflow[j] && skipNanInf) continue;
+
+                                    if (IsFloatInfinity(correct2)
+                                        || IsFloatNaN(correct2)
+                                        || IsFloatInfinity(correct3)
+                                        || IsFloatNaN(correct3))
                                         continue;
                                 }
 
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
+                                err2 = Ulp_Error(test, correct2);
+                                err3 = Ulp_Error(test, correct3);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= ulps))
+                                        && (!(fabsf(err3) <= ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
 
                                 // retry per section 6.5.3.4
-                                if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) )
+                                if (IsFloatResultSubnormal(correct2, ulps)
+                                    || IsFloatResultSubnormal(correct3, ulps))
                                 {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
                                 }
                             }
                         }
                     }
 
-                    if( fabsf(err ) > tinfo->maxError )
+                    if (fabsf(err) > tinfo->maxError)
                     {
                         tinfo->maxError = fabsf(err);
                         tinfo->maxErrorValue = s[j];
                         tinfo->maxErrorValue2 = s2[j];
                     }
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %s%s: %f ulp error at {%a (0x%x), %a (0x%x)}: *%a vs. %a (0x%8.8x) at index: %d\n", name, sizeNames[k], err, s[j], ((cl_uint*)s)[j], s2[j], ((cl_uint*)s2)[j], r[j], test, ((cl_uint*)&test)[0], j );
+                        vlog_error(
+                            "\nERROR: %s%s: %f ulp error at {%a (0x%x), %a "
+                            "(0x%x)}: *%a vs. %a (0x%8.8x) at index: %d\n",
+                            name, sizeNames[k], err, s[j], ((cl_uint *)s)[j],
+                            s2[j], ((cl_uint *)s2)[j], r[j], test,
+                            ((cl_uint *)&test)[0], j);
                         error = -1;
                         goto exit;
                     }
@@ -938,93 +1181,192 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
         }
     }
 
-    if (isFDim && gIsInRTZMode)
-        (void)set_round(oldRoundMode, kfloat);
+    if (isFDim && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements,  job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-            vlog("." );
+            vlog(".");
         }
         fflush(stdout);
     }
 
 
 exit:
-    if( overflow )
-        free( overflow );
+    if (overflow) free(overflow);
     return error;
-
 }
 
 
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
-    -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), -1000., -100.,  -4.0, -3.5,
-    -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0,
-
-    +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), +1000., +100.,  +4.0, +3.5,
-    +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0,
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
+    -1000.,
+    -100.,
+    -4.0,
+    -3.5,
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
+    -0.5,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
+    -0.25,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
+
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000.,
+    +100.,
+    +4.0,
+    +3.5,
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
+    +0.5,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
+    +0.25,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
 };
 
-static size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] );
+static size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
                                          int isNextafter, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
-    double      maxErrorVal2 = 0.0;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
 
 
-    if (gWimpyMode){
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -1036,59 +1378,79 @@ int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
     test_info.ulps = f->double_ulps;
     test_info.ftz = f->ftz || gForceFTZ;
 
-    test_info.isFDim = 0 == strcmp( "fdim", f->nameInCode );
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
     test_info.skipNanInf = 0;
     test_info.isNextafter = isNextafter;
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
@@ -1101,18 +1463,20 @@ int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            if( test_info.tinfo[i].maxError > maxError )
+            if (test_info.tinfo[i].maxError > maxError)
             {
                 maxError = test_info.tinfo[i].maxError;
                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
@@ -1120,300 +1484,346 @@ int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
             }
         }
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( cl_double ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;    // BUFFER_SIZE / vectorSize  rounded up
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-       vlog( "\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
 
 
 exit:
     // Release
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            free_mtdata( test_info.tinfo[i].d );
+            free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_double );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    float       ulps = job->ulps;
-    dptr        func = job->f->dfunc;
-    int         ftz = job->ftz;
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    const char  *name = job->f->name;
-
-    int         isNextafter = job->isNextafter;
-    cl_ulong    *t;
-    cl_double   *r,*s,*s2;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+
+    int isNextafter = job->isNextafter;
+    cl_ulong *t;
+    cl_double *r, *s, *s2;
 
     Force64BitFPUPrecision();
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_ulong  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
     cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount = specialValuesDoubleCount * specialValuesDoubleCount;
+    int totalSpecialValueCount =
+        specialValuesDoubleCount * specialValuesDoubleCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if( job_id <= (cl_uint)indx )
+    if (job_id <= (cl_uint)indx)
     { // test edge cases
         cl_double *fp = (cl_double *)p;
         cl_double *fp2 = (cl_double *)p2;
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesDoubleCount;
-    y = (job_id * buffer_elements) / specialValuesDoubleCount;
+        x = (job_id * buffer_elements) % specialValuesDoubleCount;
+        y = (job_id * buffer_elements) / specialValuesDoubleCount;
 
-        for( ; j < buffer_elements; j++ )
+        for (; j < buffer_elements; j++)
         {
             fp[j] = specialValuesDouble[x];
             fp2[j] = specialValuesDouble[y];
-            if( ++x >= specialValuesDoubleCount )
+            if (++x >= specialValuesDoubleCount)
             {
                 x = 0;
                 y++;
-                if( y >= specialValuesDoubleCount )
-                    break;
+                if (y >= specialValuesDoubleCount) break;
             }
         }
     }
 
-    //Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
     {
         p[j] = genrand_int64(d);
         p2[j] = genrand_int64(d);
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
 
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
-    r = (cl_double *)gOut_Ref  + thread_id * buffer_elements;
-    s = (cl_double *)gIn  + thread_id * buffer_elements;
-    s2 = (cl_double *)gIn2  + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = (cl_double) func.f_ff( s[j], s2[j] );
+    // Calculate the correctly rounded reference result
+    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
+    s = (cl_double *)gIn + thread_id * buffer_elements;
+    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        r[j] = (cl_double)func.f_ff(s[j], s2[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    //Verify data
+    // Verify data
     t = (cl_ulong *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_ulong *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if( t[j] != q[j] )
+            if (t[j] != q[j])
             {
-                cl_double test = ((cl_double*) q)[j];
-                long double correct = func.f_ff( s[j], s2[j] );
-                float err = Bruteforce_Ulp_Error_Double( test, correct );
-                int fail = ! (fabsf(err) <= ulps);
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_ff(s[j], s2[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
 
-                if( fail && ftz )
+                if (fail && ftz)
                 {
                     // retry per section 6.5.3.2
-                    if( IsDoubleResultSubnormal(correct, ulps ) )
+                    if (IsDoubleResultSubnormal(correct, ulps))
                     {
-                        fail = fail && ( test != 0.0f );
-                        if( ! fail )
-                            err = 0.0f;
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
                     }
 
                     // nextafter on FTZ platforms may return the smallest
@@ -1426,103 +1836,113 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
                     // normal number is the next representable number.
                     // In which case, it should have the same sign as the
                     // second argument.
-                    if (isNextafter )
+                    if (isNextafter)
                     {
-                        if(IsDoubleSubnormal(s[j]) || s[j] == 0.0f)
+                        if (IsDoubleSubnormal(s[j]) || s[j] == 0.0f)
                         {
                             cl_double value = copysign(twoToMinus1022, s2[j]);
                             fail = fail && (test != value);
-                            if (!fail)
-                                err = 0.0f;
+                            if (!fail) err = 0.0f;
                         }
                     }
                     else
                     {
                         // retry per section 6.5.3.3
-                        if( IsDoubleSubnormal( s[j] ) )
+                        if (IsDoubleSubnormal(s[j]))
                         {
-                            long double correct2 = func.f_ff( 0.0, s2[j] );
-                            long double correct3 = func.f_ff( -0.0, s2[j] );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            long double correct2 = func.f_ff(0.0, s2[j]);
+                            long double correct3 = func.f_ff(-0.0, s2[j]);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) )
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
 
-                            //try with both args as zero
-                            if( IsDoubleSubnormal( s2[j] )  )
+                            // try with both args as zero
+                            if (IsDoubleSubnormal(s2[j]))
                             {
-                                correct2 = func.f_ff( 0.0, 0.0 );
-                                correct3 = func.f_ff( -0.0, 0.0 );
-                                long double correct4 = func.f_ff( 0.0, -0.0 );
-                                long double correct5 = func.f_ff( -0.0, -0.0 );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)) &&
-                                                 (!(fabsf(err4) <= ulps)) && (!(fabsf(err5) <= ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
+                                correct2 = func.f_ff(0.0, 0.0);
+                                correct3 = func.f_ff(-0.0, 0.0);
+                                long double correct4 = func.f_ff(0.0, -0.0);
+                                long double correct5 = func.f_ff(-0.0, -0.0);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= ulps))
+                                        && (!(fabsf(err3) <= ulps))
+                                        && (!(fabsf(err4) <= ulps))
+                                        && (!(fabsf(err5) <= ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
 
                                 // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) ||
-                                    IsDoubleResultSubnormal( correct4, ulps ) || IsDoubleResultSubnormal( correct5, ulps ) )
+                                if (IsDoubleResultSubnormal(correct2, ulps)
+                                    || IsDoubleResultSubnormal(correct3, ulps)
+                                    || IsDoubleResultSubnormal(correct4, ulps)
+                                    || IsDoubleResultSubnormal(correct5, ulps))
                                 {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
                                 }
                             }
                         }
-                        else if(IsDoubleSubnormal(s2[j]) )
+                        else if (IsDoubleSubnormal(s2[j]))
                         {
-                            long double correct2 = func.f_ff( s[j], 0.0 );
-                            long double correct3 = func.f_ff( s[j], -0.0 );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            long double correct2 = func.f_ff(s[j], 0.0);
+                            long double correct3 = func.f_ff(s[j], -0.0);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) )
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
                 }
 
-                if( fabsf(err ) > tinfo->maxError )
+                if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);
                     tinfo->maxErrorValue = s[j];
                     tinfo->maxErrorValue2 = s2[j];
                 }
-                if( fail )
+                if (fail)
                 {
-                    vlog_error( "\nERROR: %s%s: %f ulp error at {%.13la, %.13la}: *%.13la vs. %.13la\n", name, sizeNames[k], err, s[j], s2[j], r[j], test );
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, "
+                               "%.13la}: *%.13la vs. %.13la\n",
+                               name, sizeNames[k], err, s[j], s2[j], r[j],
+                               test);
                     error = -1;
                     goto exit;
                 }
@@ -1530,33 +1950,37 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements,  job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-            vlog("." );
+            vlog(".");
         }
         fflush(stdout);
     }
 exit:
     return error;
-
 }
 
 int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
@@ -1580,4 +2004,3 @@ int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata d,
 {
     return TestFunc_Double_Double_Double_common(f, d, 1, relaxedMode);
 }
-
diff --git a/test_conformance/math_brute_force/binaryOperator.cpp b/test_conformance/math_brute_force/binaryOperator.cpp
index abcb1b0015..f6ba838a6d 100644
--- a/test_conformance/math_brute_force/binaryOperator.cpp
+++ b/test_conformance/math_brute_force/binaryOperator.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -38,63 +38,85 @@ static int BuildKernel(const char *name, const char *operator_symbol,
                        int vectorSize, cl_uint kernel_count, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-                            "__kernel void ", name, "_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2 )\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   out[i] =  in1[i] ", operator_symbol, " in2[i];\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "__kernel void ", name, "_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global float* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-                            "       f0 = f0 ", operator_symbol, " f1;\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 f0, f1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = f0 ", operator_symbol, " f1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void ",
+                        name,
+                        "_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] =  in1[i] ",
+                        operator_symbol,
+                        " in2[i];\n"
+                        "}\n" };
+    const char *c3[] = {
+        "__kernel void ",
+        name,
+        "_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in, __global float* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       f0 = f0 ",
+        operator_symbol,
+        " f1;\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0, f1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = f0 ",
+        operator_symbol,
+        " f1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "%s_kernel%s", name, sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "%s_kernel%s", name,
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -104,65 +126,87 @@ static int BuildKernelDouble(const char *name, const char *operator_symbol,
                              int vectorSize, cl_uint kernel_count, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-                            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void ", name, "_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2 )\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   out[i] =  in1[i] ", operator_symbol, " in2[i];\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-                            "__kernel void ", name, "_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in, __global double* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       double3 d0 = vload3( 0, in + 3 * i );\n"
-                            "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-                            "       d0 = d0 ", operator_symbol, " d1;\n"
-                            "       vstore3( d0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       double3 d0, d1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       d0 = d0 ", operator_symbol, " d1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = d0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = d0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void ",
+                        name,
+                        "_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] =  in1[i] ",
+                        operator_symbol,
+                        " in2[i];\n"
+                        "}\n" };
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+        "__kernel void ",
+        name,
+        "_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       d0 = d0 ",
+        operator_symbol,
+        " d1;\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0, d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = d0 ",
+        operator_symbol,
+        " d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "%s_kernel%s", name, sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "%s_kernel%s", name,
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -170,114 +214,214 @@ static int BuildKernelDouble(const char *name, const char *operator_symbol,
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_uint     kernel_count;
-    cl_kernel   **kernels;
-    cl_program  *programs;
-    const char  *name;
-    const char  *operator_symbol;
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count;
+    cl_kernel **kernels;
+    cl_program *programs;
+    const char *name;
+    const char *operator_symbol;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->name, info->operator_symbol, i, info->kernel_count,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->name, info->operator_symbol, i,
                              info->kernel_count, info->kernels[i],
                              info->programs + i, info->relaxedMode);
 }
 
-//Thread specific data for a worker thread
+// Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
-    cl_mem      inBuf;                              // input buffer for the thread
-    cl_mem      inBuf2;                             // input buffer for the thread
-    cl_mem      outBuf[ VECTOR_SIZE_COUNT ];        // output buffers for the thread
-    float       maxError;                           // max error value. Init to 0.
-    double      maxErrorValue;                      // position of the max error value (param 1).  Init to 0.
-    double      maxErrorValue2;                     // position of the max error value (param 2).  Init to 0.
-    MTdata      d;
-    cl_command_queue tQueue;                        // per thread command queue to improve performance
-}ThreadInfo;
+    cl_mem inBuf; // input buffer for the thread
+    cl_mem inBuf2; // input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    double maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdata d;
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
 
 typedef struct TestInfo
 {
-    size_t      subBufferSize;                      // Size of the sub-buffer in elements
-    const Func  *f;                                 // A pointer to the function info
-    cl_program  programs[ VECTOR_SIZE_COUNT ];      // programs for various vector sizes
-    cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
-    ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
-    cl_uint     threadCount;                        // Number of worker threads
-    cl_uint     jobCount;                           // Number of jobs
-    cl_uint     step;                               // step between each chunk and the next.
-    cl_uint     scale;                              // stride between individual test values
-    float       ulps;                               // max_allowed ulps
-    int         ftz;                                // non-zero if running in flush to zero mode
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
     bool relaxedMode; // True if the test is being run in relaxed mode, false
                       // otherwise.
 
     // no special fields
-}TestInfo;
+} TestInfo;
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
-    -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),  MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), -1000.f, -100.f,  -4.0f, -3.5f,
-    -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f, MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),  MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f, MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f,
-
-    +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), +1000.f, +100.f, +4.0f, +3.5f,
-    +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f
+    -NAN,
+    -INFINITY,
+    -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
+    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
+    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
+    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
+    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
+    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
+    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
+    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
+    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
+    -1000.f,
+    -100.f,
+    -4.0f,
+    -3.5f,
+    -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
+    -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
+    -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
+    -1.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
+    -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
+    -0.5f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
+    -0.25f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
+    -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
+    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
+    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
+    -0.0f,
+
+    +NAN,
+    +INFINITY,
+    +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
+    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
+    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
+    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
+    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
+    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
+    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
+    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
+    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
+    +1000.f,
+    +100.f,
+    +4.0f,
+    +3.5f,
+    +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
+    2.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
+    +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
+    1.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
+    +1.0f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
+    +0.5f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
+    +0.25f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
+    +FLT_MIN,
+    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
+    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
+    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
+    +0.0f
 };
 
-static size_t specialValuesFloatCount = sizeof( specialValuesFloat ) / sizeof( specialValuesFloat[0] );
+static size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
                                         bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
-    double      maxErrorVal2 = 0.0;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
-    if (gWimpyMode) {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
     test_info.step = test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -287,59 +431,80 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
 
     test_info.f = f;
     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     test_info.relaxedMode = relaxedMode;
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
 
@@ -355,18 +520,20 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
                                        f->name,
                                        f->nameInCode,
                                        relaxedMode };
-        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            if( test_info.tinfo[i].maxError > maxError )
+            if (test_info.tinfo[i].maxError > maxError)
             {
                 maxError = test_info.tinfo[i].maxError;
                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
@@ -374,110 +541,130 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
             }
         }
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = (genrand_int32(d) & ~0x40000000) | 0x20000000;
             p2[j] = 0x3fc00000;
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;    // BUFFER_SIZE / vectorSize  rounded up
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
 
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
@@ -485,12 +672,12 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
 
 static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_float );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    fptr        func = job->f->func;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    fptr func = job->f->func;
     bool relaxedMode = job->relaxedMode;
     float ulps = getAllowedUlpError(job->f, relaxedMode);
     if (relaxedMode)
@@ -499,74 +686,77 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
 
-    int         ftz = job->ftz;
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    cl_uchar    *overflow = (cl_uchar*)malloc(buffer_size);
-    const char  *name = job->f->name;
-    cl_uint     *t;
-    cl_float    *r,*s,*s2;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
+    const char *name = job->f->name;
+    cl_uint *t;
+    cl_float *r, *s, *s2;
     RoundingMode oldRoundMode;
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_uint  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_uint *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (uint32_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
 
-    int totalSpecialValueCount = specialValuesFloatCount * specialValuesFloatCount;
+    int totalSpecialValueCount =
+        specialValuesFloatCount * specialValuesFloatCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
 
-    if( job_id <= (cl_uint)indx ) {
+    if (job_id <= (cl_uint)indx)
+    {
         // Insert special values
         uint32_t x, y;
 
         x = (job_id * buffer_elements) % specialValuesFloatCount;
         y = (job_id * buffer_elements) / specialValuesFloatCount;
 
-        for( ; j < buffer_elements; j++ ) {
+        for (; j < buffer_elements; j++)
+        {
             p[j] = ((cl_uint *)specialValuesFloat)[x];
             p2[j] = ((cl_uint *)specialValuesFloat)[y];
             ++x;
-            if (x >= specialValuesFloatCount) {
+            if (x >= specialValuesFloatCount)
+            {
                 x = 0;
                 y++;
-                if (y >= specialValuesFloatCount)
-                    break;
+                if (y >= specialValuesFloatCount) break;
             }
             if (relaxedMode && strcmp(name, "divide") == 0)
             {
                 cl_uint pj = p[j] & 0x7fffffff;
                 cl_uint p2j = p2[j] & 0x7fffffff;
                 // Replace values outside [2^-62, 2^62] with QNaN
-                if (pj < 0x20800000 || pj > 0x5e800000)
-                    p[j] = 0x7fc00000;
-                if (p2j < 0x20800000 || p2j > 0x5e800000)
-                    p2[j] = 0x7fc00000;
+                if (pj < 0x20800000 || pj > 0x5e800000) p[j] = 0x7fc00000;
+                if (p2j < 0x20800000 || p2j > 0x5e800000) p2[j] = 0x7fc00000;
             }
         }
     }
 
     // Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    for (; j < buffer_elements; j++)
     {
         p[j] = genrand_int32(d);
         p2[j] = genrand_int32(d);
@@ -576,316 +766,353 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
             cl_uint pj = p[j] & 0x7fffffff;
             cl_uint p2j = p2[j] & 0x7fffffff;
             // Replace values outside [2^-62, 2^62] with QNaN
-            if (pj < 0x20800000 || pj > 0x5e800000)
-                p[j] = 0x7fc00000;
-            if (p2j < 0x20800000 || p2j > 0x5e800000)
-                p2[j] = 0x7fc00000;
+            if (pj < 0x20800000 || pj > 0x5e800000) p[j] = 0x7fc00000;
+            if (p2j < 0x20800000 || p2j > 0x5e800000) p2[j] = 0x7fc00000;
         }
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
 
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
+    if (gSkipCorrectnessTesting)
     {
-        free( overflow );
+        free(overflow);
         return CL_SUCCESS;
     }
 
-    //Calculate the correctly rounded reference result
+    // Calculate the correctly rounded reference result
     FPU_mode_type oldMode;
-    memset( &oldMode, 0, sizeof( oldMode ) );
-    if( ftz )
-        ForceFTZ( &oldMode );
+    memset(&oldMode, 0, sizeof(oldMode));
+    if (ftz) ForceFTZ(&oldMode);
 
     // Set the rounding mode to match the device
     oldRoundMode = kRoundToNearestEven;
-    if (gIsInRTZMode)
-        oldRoundMode = set_round(kRoundTowardZero, kfloat);
-
-    //Calculate the correctly rounded reference result
-    r = (float *)gOut_Ref  + thread_id * buffer_elements;
-    s = (float *)gIn  + thread_id * buffer_elements;
-    s2 = (float *)gIn2  + thread_id * buffer_elements;
-    if( gInfNanSupport )
+    if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
+
+    // Calculate the correctly rounded reference result
+    r = (float *)gOut_Ref + thread_id * buffer_elements;
+    s = (float *)gIn + thread_id * buffer_elements;
+    s2 = (float *)gIn2 + thread_id * buffer_elements;
+    if (gInfNanSupport)
     {
-        for( j = 0; j < buffer_elements; j++ )
-            r[j] = (float) func.f_ff( s[j], s2[j] );
+        for (j = 0; j < buffer_elements; j++)
+            r[j] = (float)func.f_ff(s[j], s2[j]);
     }
     else
     {
-        for( j = 0; j < buffer_elements; j++ )
+        for (j = 0; j < buffer_elements; j++)
         {
             feclearexcept(FE_OVERFLOW);
-            r[j] = (float) func.f_ff( s[j], s2[j] );
-            overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+            r[j] = (float)func.f_ff(s[j], s2[j]);
+            overflow[j] =
+                FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
         }
     }
 
-    if (gIsInRTZMode)
-      (void)set_round(oldRoundMode, kfloat);
+    if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
 
-    if( ftz )
-        RestoreFPState( &oldMode );
+    if (ftz) RestoreFPState(&oldMode);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (uint32_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    //Verify data
+    // Verify data
     t = (cl_uint *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_uint *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if( t[j] != q[j] )
+            if (t[j] != q[j])
             {
-                float test = ((float*) q)[j];
-                double correct = func.f_ff( s[j], s2[j] );
+                float test = ((float *)q)[j];
+                double correct = func.f_ff(s[j], s2[j]);
 
-                // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                if ( !gInfNanSupport)
+                // Per section 10 paragraph 6, accept any result if an input or
+                // output is a infinity or NaN or overflow
+                if (!gInfNanSupport)
                 {
-                    // Note: no double rounding here.  Reference functions calculate in single precision.
-                    if( overflow[j]                                         ||
-                        IsFloatInfinity(correct) || IsFloatNaN(correct)     ||
-                        IsFloatInfinity(s2[j])   || IsFloatNaN(s2[j])       ||
-                        IsFloatInfinity(s[j])    || IsFloatNaN(s[j])        )
+                    // Note: no double rounding here.  Reference functions
+                    // calculate in single precision.
+                    if (overflow[j] || IsFloatInfinity(correct)
+                        || IsFloatNaN(correct) || IsFloatInfinity(s2[j])
+                        || IsFloatNaN(s2[j]) || IsFloatInfinity(s[j])
+                        || IsFloatNaN(s[j]))
                         continue;
                 }
 
-        // Per section 10 paragraph 6, accept embedded devices always returning positive 0.0.
-        if (gIsEmbedded && (t[j] == 0x80000000) && (q[j] == 0x00000000)) continue;
+                // Per section 10 paragraph 6, accept embedded devices always
+                // returning positive 0.0.
+                if (gIsEmbedded && (t[j] == 0x80000000) && (q[j] == 0x00000000))
+                    continue;
 
-                float err = Ulp_Error( test, correct );
-                float errB = Ulp_Error( test, (float) correct  );
+                float err = Ulp_Error(test, correct);
+                float errB = Ulp_Error(test, (float)correct);
 
-                int fail = ((!(fabsf(err) <= ulps)) && (!(fabsf(errB) <= ulps)));
-                if( fabsf( errB ) < fabsf(err ) )
-                  err = errB;
+                int fail =
+                    ((!(fabsf(err) <= ulps)) && (!(fabsf(errB) <= ulps)));
+                if (fabsf(errB) < fabsf(err)) err = errB;
 
-                if( fail && ftz )
+                if (fail && ftz)
                 {
                     // retry per section 6.5.3.2
-                    if( IsFloatResultSubnormal(correct, ulps ) )
+                    if (IsFloatResultSubnormal(correct, ulps))
                     {
-                        fail = fail && ( test != 0.0f );
-                        if( ! fail )
-                            err = 0.0f;
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
                     }
 
                     // retry per section 6.5.3.3
-                    if( IsFloatSubnormal( s[j] ) )
+                    if (IsFloatSubnormal(s[j]))
                     {
                         double correct2, correct3;
                         float err2, err3;
 
-                        if( !gInfNanSupport )
-                            feclearexcept(FE_OVERFLOW);
+                        if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
 
-                        correct2 = func.f_ff( 0.0, s2[j] );
-                        correct3 = func.f_ff( -0.0, s2[j] );
+                        correct2 = func.f_ff(0.0, s2[j]);
+                        correct3 = func.f_ff(-0.0, s2[j]);
 
-                        // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                        if( !gInfNanSupport )
+                        // Per section 10 paragraph 6, accept any result if an
+                        // input or output is a infinity or NaN or overflow
+                        if (!gInfNanSupport)
                         {
-                            if( fetestexcept(FE_OVERFLOW) )
-                                continue;
-
-                            // Note: no double rounding here.  Reference functions calculate in single precision.
-                            if( IsFloatInfinity(correct2) || IsFloatNaN(correct2)   ||
-                                IsFloatInfinity(correct3) || IsFloatNaN(correct3)    )
+                            if (fetestexcept(FE_OVERFLOW)) continue;
+
+                            // Note: no double rounding here.  Reference
+                            // functions calculate in single precision.
+                            if (IsFloatInfinity(correct2)
+                                || IsFloatNaN(correct2)
+                                || IsFloatInfinity(correct3)
+                                || IsFloatNaN(correct3))
                                 continue;
                         }
 
-                        err2 = Ulp_Error( test, correct2  );
-                        err3 = Ulp_Error( test, correct3  );
-                        fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                        if( fabsf( err2 ) < fabsf(err ) )
-                            err = err2;
-                        if( fabsf( err3 ) < fabsf(err ) )
-                            err = err3;
+                        err2 = Ulp_Error(test, correct2);
+                        err3 = Ulp_Error(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) )
+                        if (IsFloatResultSubnormal(correct2, ulps)
+                            || IsFloatResultSubnormal(correct3, ulps))
                         {
-                            fail = fail && ( test != 0.0f);
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
 
-                        //try with both args as zero
-                        if( IsFloatSubnormal( s2[j] )  )
+                        // try with both args as zero
+                        if (IsFloatSubnormal(s2[j]))
                         {
                             double correct4, correct5;
                             float err4, err5;
 
-                            if( !gInfNanSupport )
-                                feclearexcept(FE_OVERFLOW);
+                            if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
 
-                            correct2 = func.f_ff( 0.0, 0.0 );
-                            correct3 = func.f_ff( -0.0, 0.0 );
-                            correct4 = func.f_ff( 0.0, -0.0 );
-                            correct5 = func.f_ff( -0.0, -0.0 );
+                            correct2 = func.f_ff(0.0, 0.0);
+                            correct3 = func.f_ff(-0.0, 0.0);
+                            correct4 = func.f_ff(0.0, -0.0);
+                            correct5 = func.f_ff(-0.0, -0.0);
 
-                            // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                            if( !gInfNanSupport )
+                            // Per section 10 paragraph 6, accept any result if
+                            // an input or output is a infinity or NaN or
+                            // overflow
+                            if (!gInfNanSupport)
                             {
-                                if( fetestexcept(FE_OVERFLOW) )
-                                    continue;
-
-                                // Note: no double rounding here.  Reference functions calculate in single precision.
-                                if( IsFloatInfinity(correct2) || IsFloatNaN(correct2)   ||
-                                    IsFloatInfinity(correct3) || IsFloatNaN(correct3)   ||
-                                    IsFloatInfinity(correct4) || IsFloatNaN(correct4)   ||
-                                    IsFloatInfinity(correct5) || IsFloatNaN(correct5)    )
+                                if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correct2)
+                                    || IsFloatNaN(correct2)
+                                    || IsFloatInfinity(correct3)
+                                    || IsFloatNaN(correct3)
+                                    || IsFloatInfinity(correct4)
+                                    || IsFloatNaN(correct4)
+                                    || IsFloatInfinity(correct5)
+                                    || IsFloatNaN(correct5))
                                     continue;
                             }
 
-                            err2 = Ulp_Error( test, correct2  );
-                            err3 = Ulp_Error( test, correct3  );
-                            err4 = Ulp_Error( test, correct4  );
-                            err5 = Ulp_Error( test, correct5  );
-                            fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)) &&
-                                             (!(fabsf(err4) <= ulps)) && (!(fabsf(err5) <= ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-                            if( fabsf( err4 ) < fabsf(err ) )
-                                err = err4;
-                            if( fabsf( err5 ) < fabsf(err ) )
-                                err = err5;
+                            err2 = Ulp_Error(test, correct2);
+                            err3 = Ulp_Error(test, correct3);
+                            err4 = Ulp_Error(test, correct4);
+                            err5 = Ulp_Error(test, correct5);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps))
+                                    && (!(fabsf(err4) <= ulps))
+                                    && (!(fabsf(err5) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
 
                             // retry per section 6.5.3.4
-                            if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) ||
-                                IsFloatResultSubnormal( correct4, ulps ) || IsFloatResultSubnormal( correct5, ulps ) )
+                            if (IsFloatResultSubnormal(correct2, ulps)
+                                || IsFloatResultSubnormal(correct3, ulps)
+                                || IsFloatResultSubnormal(correct4, ulps)
+                                || IsFloatResultSubnormal(correct5, ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
-                    else if(IsFloatSubnormal(s2[j]) )
+                    else if (IsFloatSubnormal(s2[j]))
                     {
                         double correct2, correct3;
                         float err2, err3;
 
-                        if( !gInfNanSupport )
-                            feclearexcept(FE_OVERFLOW);
+                        if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
 
-                        correct2 = func.f_ff( s[j], 0.0 );
-                        correct3 = func.f_ff( s[j], -0.0 );
+                        correct2 = func.f_ff(s[j], 0.0);
+                        correct3 = func.f_ff(s[j], -0.0);
 
-                        // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                        if ( !gInfNanSupport)
+                        // Per section 10 paragraph 6, accept any result if an
+                        // input or output is a infinity or NaN or overflow
+                        if (!gInfNanSupport)
                         {
-                            // Note: no double rounding here.  Reference functions calculate in single precision.
-                            if( overflow[j]                                         ||
-                                IsFloatInfinity(correct) || IsFloatNaN(correct)     ||
-                                IsFloatInfinity(correct2)|| IsFloatNaN(correct2)    )
+                            // Note: no double rounding here.  Reference
+                            // functions calculate in single precision.
+                            if (overflow[j] || IsFloatInfinity(correct)
+                                || IsFloatNaN(correct)
+                                || IsFloatInfinity(correct2)
+                                || IsFloatNaN(correct2))
                                 continue;
                         }
 
-                        err2 = Ulp_Error( test, correct2  );
-                        err3 = Ulp_Error( test, correct3  );
-                        fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                        if( fabsf( err2 ) < fabsf(err ) )
-                            err = err2;
-                        if( fabsf( err3 ) < fabsf(err ) )
-                            err = err3;
+                        err2 = Ulp_Error(test, correct2);
+                        err3 = Ulp_Error(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) )
+                        if (IsFloatResultSubnormal(correct2, ulps)
+                            || IsFloatResultSubnormal(correct3, ulps))
                         {
-                            fail = fail && ( test != 0.0f);
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
                     }
                 }
 
 
-                if( fabsf(err ) > tinfo->maxError )
+                if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);
                     tinfo->maxErrorValue = s[j];
                     tinfo->maxErrorValue2 = s2[j];
                 }
-                if( fail )
+                if (fail)
                 {
-                    vlog_error( "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a (0x%8.8x) at index: %d\n", name, sizeNames[k], err, s[j], s2[j], r[j], test, ((cl_uint*)&test)[0], j );
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%a, %a}: *%a "
+                               "vs. %a (0x%8.8x) at index: %d\n",
+                               name, sizeNames[k], err, s[j], s2[j], r[j], test,
+                               ((cl_uint *)&test)[0], j);
                     error = -1;
                     goto exit;
                 }
@@ -893,85 +1120,185 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step,  job->scale, buffer_elements, job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-            vlog("." );
+            vlog(".");
         }
         fflush(stdout);
     }
 exit:
-    if( overflow )
-        free( overflow );
+    if (overflow) free(overflow);
     return error;
 }
 
 
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
-    -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), -1000., -100.,  -4.0, -3.5,
-    -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0,
-
-    +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), +1000., +100.,  +4.0, +3.5,
-    +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0,
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
+    -1000.,
+    -100.,
+    -4.0,
+    -3.5,
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
+    -0.5,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
+    -0.25,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
+
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000.,
+    +100.,
+    +4.0,
+    +3.5,
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
+    +0.5,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
+    +0.25,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
 };
 
-static size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] );
+static size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
                                            bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
-    double      maxErrorVal2 = 0.0;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
     if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -983,56 +1310,76 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
     test_info.ulps = f->double_ulps;
     test_info.ftz = f->ftz || gForceFTZ;
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
 
@@ -1049,18 +1396,20 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
                                        f->name,
                                        f->nameInCode,
                                        relaxedMode };
-        if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            if( test_info.tinfo[i].maxError > maxError )
+            if (test_info.tinfo[i].maxError > maxError)
             {
                 maxError = test_info.tinfo[i].maxError;
                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
@@ -1068,387 +1417,441 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
             }
         }
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( cl_double ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;    // BUFFER_SIZE / vectorSize  rounded up
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
 
 
 exit:
     // Release
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_double );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    float       ulps = job->ulps;
-    dptr        func = job->f->dfunc;
-    int         ftz = job->ftz;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
+    int ftz = job->ftz;
     bool relaxedMode = job->relaxedMode;
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    const char  *name = job->f->name;
-    cl_ulong    *t;
-    cl_double   *r,*s,*s2;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_ulong *t;
+    cl_double *r, *s, *s2;
 
     Force64BitFPUPrecision();
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_ulong  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
     cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount = specialValuesDoubleCount * specialValuesDoubleCount;
+    int totalSpecialValueCount =
+        specialValuesDoubleCount * specialValuesDoubleCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if( job_id <= (cl_uint)indx )
+    if (job_id <= (cl_uint)indx)
     { // test edge cases
         cl_double *fp = (cl_double *)p;
         cl_double *fp2 = (cl_double *)p2;
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesDoubleCount;
-    y = (job_id * buffer_elements) / specialValuesDoubleCount;
+        x = (job_id * buffer_elements) % specialValuesDoubleCount;
+        y = (job_id * buffer_elements) / specialValuesDoubleCount;
 
-        for( ; j < buffer_elements; j++ )
+        for (; j < buffer_elements; j++)
         {
             fp[j] = specialValuesDouble[x];
             fp2[j] = specialValuesDouble[y];
-            if( ++x >= specialValuesDoubleCount )
+            if (++x >= specialValuesDoubleCount)
             {
                 x = 0;
                 y++;
-                if( y >= specialValuesDoubleCount )
-                    break;
+                if (y >= specialValuesDoubleCount) break;
             }
         }
     }
 
-    //Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
     {
         p[j] = genrand_int64(d);
         p2[j] = genrand_int64(d);
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
 
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
-    r = (cl_double *)gOut_Ref  + thread_id * buffer_elements;
-    s = (cl_double *)gIn  + thread_id * buffer_elements;
-    s2 = (cl_double *)gIn2  + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = (cl_double) func.f_ff( s[j], s2[j] );
+    // Calculate the correctly rounded reference result
+    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
+    s = (cl_double *)gIn + thread_id * buffer_elements;
+    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        r[j] = (cl_double)func.f_ff(s[j], s2[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    //Verify data
+    // Verify data
     t = (cl_ulong *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_ulong *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if( t[j] != q[j] )
+            if (t[j] != q[j])
             {
-                cl_double test = ((cl_double*) q)[j];
-                long double correct = func.f_ff( s[j], s2[j] );
-                float err = Bruteforce_Ulp_Error_Double( test, correct );
-                int fail = ! (fabsf(err) <= ulps);
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_ff(s[j], s2[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
 
-                if( fail && ftz )
+                if (fail && ftz)
                 {
                     // retry per section 6.5.3.2
-                    if( IsDoubleResultSubnormal(correct, ulps ) )
+                    if (IsDoubleResultSubnormal(correct, ulps))
                     {
-                        fail = fail && ( test != 0.0f );
-                        if( ! fail )
-                            err = 0.0f;
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
                     }
 
 
                     // retry per section 6.5.3.3
-                    if( IsDoubleSubnormal( s[j] ) )
+                    if (IsDoubleSubnormal(s[j]))
                     {
-                        long double correct2 = func.f_ff( 0.0, s2[j] );
-                        long double correct3 = func.f_ff( -0.0, s2[j] );
-                        float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                        float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                        fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                        if( fabsf( err2 ) < fabsf(err ) )
-                            err = err2;
-                        if( fabsf( err3 ) < fabsf(err ) )
-                            err = err3;
+                        long double correct2 = func.f_ff(0.0, s2[j]);
+                        long double correct3 = func.f_ff(-0.0, s2[j]);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct2);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) )
+                        if (IsDoubleResultSubnormal(correct2, ulps)
+                            || IsDoubleResultSubnormal(correct3, ulps))
                         {
-                            fail = fail && ( test != 0.0f);
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
 
-                        //try with both args as zero
-                        if( IsDoubleSubnormal( s2[j] )  )
+                        // try with both args as zero
+                        if (IsDoubleSubnormal(s2[j]))
                         {
-                            correct2 = func.f_ff( 0.0, 0.0 );
-                            correct3 = func.f_ff( -0.0, 0.0 );
-                            long double correct4 = func.f_ff( 0.0, -0.0 );
-                            long double correct5 = func.f_ff( -0.0, -0.0 );
-                            err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                            float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                            fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)) &&
-                                             (!(fabsf(err4) <= ulps)) && (!(fabsf(err5) <= ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-                            if( fabsf( err4 ) < fabsf(err ) )
-                                err = err4;
-                            if( fabsf( err5 ) < fabsf(err ) )
-                                err = err5;
+                            correct2 = func.f_ff(0.0, 0.0);
+                            correct3 = func.f_ff(-0.0, 0.0);
+                            long double correct4 = func.f_ff(0.0, -0.0);
+                            long double correct5 = func.f_ff(-0.0, -0.0);
+                            err2 = Bruteforce_Ulp_Error_Double(test, correct2);
+                            err3 = Bruteforce_Ulp_Error_Double(test, correct3);
+                            float err4 =
+                                Bruteforce_Ulp_Error_Double(test, correct4);
+                            float err5 =
+                                Bruteforce_Ulp_Error_Double(test, correct5);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps))
+                                    && (!(fabsf(err4) <= ulps))
+                                    && (!(fabsf(err5) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) ||
-                                IsDoubleResultSubnormal( correct4, ulps ) || IsDoubleResultSubnormal( correct5, ulps ) )
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps)
+                                || IsDoubleResultSubnormal(correct4, ulps)
+                                || IsDoubleResultSubnormal(correct5, ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
-                    else if(IsDoubleSubnormal(s2[j]) )
+                    else if (IsDoubleSubnormal(s2[j]))
                     {
-                        long double correct2 = func.f_ff( s[j], 0.0 );
-                        long double correct3 = func.f_ff( s[j], -0.0 );
-                        float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                        float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                        fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                        if( fabsf( err2 ) < fabsf(err ) )
-                            err = err2;
-                        if( fabsf( err3 ) < fabsf(err ) )
-                            err = err3;
+                        long double correct2 = func.f_ff(s[j], 0.0);
+                        long double correct3 = func.f_ff(s[j], -0.0);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct2);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) )
+                        if (IsDoubleResultSubnormal(correct2, ulps)
+                            || IsDoubleResultSubnormal(correct3, ulps))
                         {
-                            fail = fail && ( test != 0.0f);
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
                     }
                 }
 
-                if( fabsf(err ) > tinfo->maxError )
+                if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);
                     tinfo->maxErrorValue = s[j];
                     tinfo->maxErrorValue2 = s2[j];
                 }
-                if( fail )
+                if (fail)
                 {
-                    vlog_error( "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a\n", name, sizeNames[k], err, s[j], s2[j], r[j], test );
+                    vlog_error(
+                        "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a\n",
+                        name, sizeNames[k], err, s[j], s2[j], r[j], test);
                     error = -1;
                     goto exit;
                 }
@@ -1456,36 +1859,36 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements,  job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-            vlog("." );
+            vlog(".");
         }
         fflush(stdout);
     }
 
 exit:
     return error;
-
 }
-
-
-
-
diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i.cpp
index 01f45242b8..dc6feb8c82 100644
--- a/test_conformance/math_brute_force/binary_i.cpp
+++ b/test_conformance/math_brute_force/binary_i.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -34,64 +34,83 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
-    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global int", sizeNames[vectorSize], "* in2 )\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i] );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global int* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
-                            "       f0 = ", name, "( f0, i0 );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 f0;\n"
-                            "       int3 i0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( f0, i0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global int",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in, __global int* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0, i0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       int3 i0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -101,66 +120,85 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
                              bool relaxedMode)
 {
-    const char *c[] = {     "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global int", sizeNames[vectorSize], "* in2 )\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i] );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in, __global int* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       double3 d0 = vload3( 0, in + 3 * i );\n"
-                            "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
-                            "       d0 = ", name, "( d0, i0 );\n"
-                            "       vstore3( d0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       double3 d0;\n"
-                            "       int3 i0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-                            "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       d0 = ", name, "( d0, i0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = d0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = d0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global int",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global int* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
+        "       d0 = ",
+        name,
+        "( d0, i0 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0;\n"
+        "       int3 i0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -168,27 +206,31 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_uint     kernel_count;
-    cl_kernel   **kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count;
+    cl_kernel **kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
                              info->kernels[i], info->programs + i,
@@ -198,85 +240,185 @@ static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, vo
 
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
-    -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),  MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), -1000.f, -100.f,  -4.0f, -3.5f,
-    -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f, MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),  MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f, MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f,
-
-    +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), +1000.f, +100.f, +4.0f, +3.5f,
-    +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f
+    -NAN,
+    -INFINITY,
+    -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
+    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
+    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
+    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
+    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
+    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
+    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
+    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
+    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
+    -1000.f,
+    -100.f,
+    -4.0f,
+    -3.5f,
+    -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
+    -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
+    -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
+    -1.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
+    -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
+    -0.5f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
+    -0.25f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
+    -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
+    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
+    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
+    -0.0f,
+
+    +NAN,
+    +INFINITY,
+    +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
+    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
+    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
+    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
+    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
+    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
+    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
+    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
+    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
+    +1000.f,
+    +100.f,
+    +4.0f,
+    +3.5f,
+    +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
+    2.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
+    +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
+    1.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
+    +1.0f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
+    +0.5f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
+    +0.25f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
+    +FLT_MIN,
+    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
+    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
+    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
+    +0.0f
 };
-static size_t specialValuesFloatCount = sizeof( specialValuesFloat ) / sizeof( specialValuesFloat[0] );
+static size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
 
-static const int specialValuesInt[] = { 0, 1, 2, 3, 126, 127, 128, 0x02000001, 0x04000001, 1465264071, 1488522147,
-                                            -1, -2, -3, -126, -127, -128, -0x02000001, -0x04000001, -1465264071, -1488522147 };
-static size_t specialValuesIntCount = sizeof( specialValuesInt ) / sizeof( specialValuesInt[0] );
+static const int specialValuesInt[] = {
+    0,           1,           2,          3,          126,        127,
+    128,         0x02000001,  0x04000001, 1465264071, 1488522147, -1,
+    -2,          -3,          -126,       -127,       -128,       -0x02000001,
+    -0x04000001, -1465264071, -1488522147
+};
+static size_t specialValuesIntCount =
+    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
 
-//Thread specific data for a worker thread
+// Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
-    cl_mem      inBuf;                              // input buffer for the thread
-    cl_mem      inBuf2;                             // input buffer for the thread
-    cl_mem      outBuf[ VECTOR_SIZE_COUNT ];        // output buffers for the thread
-    float       maxError;                           // max error value. Init to 0.
-    double      maxErrorValue;                      // position of the max error value (param 1).  Init to 0.
-    cl_int      maxErrorValue2;                     // position of the max error value (param 2).  Init to 0.
-    MTdata      d;
-    cl_command_queue tQueue;                        // per thread command queue to improve performance
-}ThreadInfo;
+    cl_mem inBuf; // input buffer for the thread
+    cl_mem inBuf2; // input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    cl_int maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdata d;
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
 
 typedef struct TestInfo
 {
-    size_t      subBufferSize;                      // Size of the sub-buffer in elements
-    const Func  *f;                                 // A pointer to the function info
-    cl_program  programs[ VECTOR_SIZE_COUNT ];      // programs for various vector sizes
-    cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
-    ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
-    cl_uint     threadCount;                        // Number of worker threads
-    cl_uint     jobCount;                           // Number of jobs
-    cl_uint     step;                               // step between each chunk and the next.
-    cl_uint     scale;                              // stride between individual test values
-    float       ulps;                               // max_allowed ulps
-    int         ftz;                                // non-zero if running in flush to zero mode
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
 
     // no special values
-}TestInfo;
+} TestInfo;
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
-    cl_int      maxErrorVal2 = 0;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    cl_int maxErrorVal2 = 0;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
 
     if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -286,59 +428,82 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
 
     test_info.f = f;
     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        cl_buffer_region region2 = { i * test_info.subBufferSize * sizeof( cl_int), test_info.subBufferSize * sizeof( cl_int) };
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region2 = { i * test_info.subBufferSize
+                                         * sizeof(cl_int),
+                                     test_info.subBufferSize * sizeof(cl_int) };
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
@@ -350,18 +515,20 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
     // Run the kernels
-    error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
+    error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
 
 
     // Accumulate the arithmetic errors
-    for( i = 0; i < test_info.threadCount; i++ )
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        if( test_info.tinfo[i].maxError > maxError )
+        if (test_info.tinfo[i].maxError > maxError)
         {
             maxError = test_info.tinfo[i].maxError;
             maxErrorVal = test_info.tinfo[i].maxErrorValue;
@@ -369,331 +536,377 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
         }
     }
 
-    if( error )
-        goto exit;
+    if (error) goto exit;
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = (genrand_int32(d) & ~0x40000000) | 0x38000000;
             p2[j] = 3;
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;    // BUFFER_SIZE / vectorSize  rounded up
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
 
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_float );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    float       ulps = job->ulps;
-    fptr        func = job->f->func;
-    int         ftz = job->ftz;
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    const char  *name = job->f->name;
-    cl_uint     *t;
-    cl_float    *r,*s;
-    cl_int      *s2;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_uint *t;
+    cl_float *r, *s;
+    cl_int *s2;
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_uint  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_uint *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (uint32_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount = specialValuesFloatCount * specialValuesIntCount;
+    int totalSpecialValueCount =
+        specialValuesFloatCount * specialValuesIntCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
-    if( job_id <= (cl_uint)indx )
+    if (job_id <= (cl_uint)indx)
     { // test edge cases
         float *fp = (float *)p;
         cl_int *ip2 = (cl_int *)p2;
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesFloatCount;
-    y = (job_id * buffer_elements) / specialValuesFloatCount;
+        x = (job_id * buffer_elements) % specialValuesFloatCount;
+        y = (job_id * buffer_elements) / specialValuesFloatCount;
 
-        for( ; j < buffer_elements; j++ )
+        for (; j < buffer_elements; j++)
         {
-          fp[j] = specialValuesFloat[x];
-          ip2[j] = specialValuesInt[y];
-            if( ++x >= specialValuesFloatCount )
+            fp[j] = specialValuesFloat[x];
+            ip2[j] = specialValuesInt[y];
+            if (++x >= specialValuesFloatCount)
             {
                 x = 0;
                 y++;
-                if( y >= specialValuesIntCount )
-                    break;
+                if (y >= specialValuesIntCount) break;
             }
         }
     }
 
-    //Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
     {
-      p[j] = genrand_int32(d);
-      p2[j] = genrand_int32(d);
+        p[j] = genrand_int32(d);
+        p2[j] = genrand_int32(d);
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
 
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
-    r = (float *)gOut_Ref  + thread_id * buffer_elements;
-    s = (float *)gIn  + thread_id * buffer_elements;
-    s2 = (cl_int *)gIn2  + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = (float) func.f_fi( s[j], s2[j] );
+    // Calculate the correctly rounded reference result
+    r = (float *)gOut_Ref + thread_id * buffer_elements;
+    s = (float *)gIn + thread_id * buffer_elements;
+    s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_fi(s[j], s2[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (uint32_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    //Verify data
+    // Verify data
     t = (cl_uint *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_uint *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if( t[j] != q[j] )
+            if (t[j] != q[j])
             {
-                float test = ((float*) q)[j];
-                double correct = func.f_fi( s[j], s2[j] );
-                float err = Ulp_Error( test, correct );
-                int fail = ! (fabsf(err) <= ulps);
+                float test = ((float *)q)[j];
+                double correct = func.f_fi(s[j], s2[j]);
+                float err = Ulp_Error(test, correct);
+                int fail = !(fabsf(err) <= ulps);
 
-                if( fail && ftz )
+                if (fail && ftz)
                 {
                     // retry per section 6.5.3.2
-                    if( IsFloatResultSubnormal(correct, ulps ) )
+                    if (IsFloatResultSubnormal(correct, ulps))
                     {
-                        fail = fail && ( test != 0.0f );
-                        if( ! fail )
-                            err = 0.0f;
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
                     }
 
                     // retry per section 6.5.3.3
-                    if( IsFloatSubnormal( s[j] ) )
+                    if (IsFloatSubnormal(s[j]))
                     {
                         double correct2, correct3;
                         float err2, err3;
-                        correct2 = func.f_fi( 0.0, s2[j] );
-                        correct3 = func.f_fi( -0.0, s2[j] );
-                        err2 = Ulp_Error( test, correct2  );
-                        err3 = Ulp_Error( test, correct3  );
-                        fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                        if( fabsf( err2 ) < fabsf(err ) )
-                            err = err2;
-                        if( fabsf( err3 ) < fabsf(err ) )
-                            err = err3;
+                        correct2 = func.f_fi(0.0, s2[j]);
+                        correct3 = func.f_fi(-0.0, s2[j]);
+                        err2 = Ulp_Error(test, correct2);
+                        err3 = Ulp_Error(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) )
+                        if (IsFloatResultSubnormal(correct2, ulps)
+                            || IsFloatResultSubnormal(correct3, ulps))
                         {
-                            fail = fail && ( test != 0.0f);
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
                     }
                 }
 
-                if( fabsf(err ) > tinfo->maxError )
+                if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);
                     tinfo->maxErrorValue = s[j];
                     tinfo->maxErrorValue2 = s2[j];
                 }
-                if( fail )
+                if (fail)
                 {
                     vlog_error(
                         "\nERROR: %s%s: %f ulp error at {%a (0x%8.8x), %d}: "
@@ -708,89 +921,191 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step,  job->scale, buffer_elements, job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-            vlog("." );
+            vlog(".");
         }
         fflush(stdout);
     }
 
 exit:
     return error;
-
 }
 
 
-
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
-    -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), -1000., -100.,  -4.0, -3.5,
-    -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0,
-
-    +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), +1000., +100.,  +4.0, +3.5,
-    +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0,
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
+    -1000.,
+    -100.,
+    -4.0,
+    -3.5,
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
+    -0.5,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
+    -0.25,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
+
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000.,
+    +100.,
+    +4.0,
+    +3.5,
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
+    +0.5,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
+    +0.25,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
 };
-static size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] );
+static size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
 
-static const int specialValuesInt2[] = { 0, 1, 2, 3, 1022, 1023, 1024, INT_MIN, INT_MAX,
-                                            -1, -2, -3, -1022, -1023, -11024, -INT_MAX };
-static size_t specialValuesInt2Count = sizeof( specialValuesInt ) / sizeof( specialValuesInt[0] );
+static const int specialValuesInt2[] = { 0,       1,     2,      3,
+                                         1022,    1023,  1024,   INT_MIN,
+                                         INT_MAX, -1,    -2,     -3,
+                                         -1022,   -1023, -11024, -INT_MAX };
+static size_t specialValuesInt2Count =
+    sizeof(specialValuesInt2) / sizeof(specialValuesInt2[0]);
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
-    cl_int      maxErrorVal2 = 0;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    cl_int maxErrorVal2 = 0;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
     if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -802,59 +1117,82 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
     test_info.ulps = f->double_ulps;
     test_info.ftz = f->ftz || gForceFTZ;
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        cl_buffer_region region2 = { i * test_info.subBufferSize * sizeof( cl_int), test_info.subBufferSize * sizeof( cl_int) };
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region2 = { i * test_info.subBufferSize
+                                         * sizeof(cl_int),
+                                     test_info.subBufferSize * sizeof(cl_int) };
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+            /* Qualcomm fix: 9461 read-write flags must be compatible with
+             * parent buffer */
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
             /* Qualcomm fix: end */
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
 
@@ -868,19 +1206,21 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
     // Run the kernels
-    if( !gSkipCorrectnessTesting )
-        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
+    if (!gSkipCorrectnessTesting)
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
 
 
     // Accumulate the arithmetic errors
-    for( i = 0; i < test_info.threadCount; i++ )
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        if( test_info.tinfo[i].maxError > maxError )
+        if (test_info.tinfo[i].maxError > maxError)
         {
             maxError = test_info.tinfo[i].maxError;
             maxErrorVal = test_info.tinfo[i].maxErrorValue;
@@ -888,334 +1228,386 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
         }
     }
 
-    if( error )
-        goto exit;
+    if (error) goto exit;
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         double *p = (double *)gIn;
         cl_int *p2 = (cl_int *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( cl_double ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = 3;
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE/2, gIn2, 0, NULL, NULL) ))
+        if ((error =
+                 clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                      BUFFER_SIZE / 2, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;    // BUFFER_SIZE / vectorSize  rounded up
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
 
 
 exit:
     // Release
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_double );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    float       ulps = job->ulps;
-    dptr        func = job->f->dfunc;
-    int         ftz = job->ftz;
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    const char  *name = job->f->name;
-    cl_ulong    *t;
-    cl_double   *r,*s;
-    cl_int      *s2;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_ulong *t;
+    cl_double *r, *s;
+    cl_int *s2;
 
     Force64BitFPUPrecision();
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_ulong  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
     cl_int *p2 = (cl_int *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount = specialValuesDoubleCount * specialValuesInt2Count;
+    int totalSpecialValueCount =
+        specialValuesDoubleCount * specialValuesInt2Count;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
-    if( job_id <= (cl_uint)indx )
+    if (job_id <= (cl_uint)indx)
     { // test edge cases
         cl_double *fp = (cl_double *)p;
         cl_int *ip2 = (cl_int *)p2;
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesDoubleCount;
-    y = (job_id * buffer_elements) / specialValuesDoubleCount;
+        x = (job_id * buffer_elements) % specialValuesDoubleCount;
+        y = (job_id * buffer_elements) / specialValuesDoubleCount;
 
-        for( ; j < buffer_elements; j++ )
+        for (; j < buffer_elements; j++)
         {
             fp[j] = specialValuesDouble[x];
             ip2[j] = specialValuesInt2[y];
-            if( ++x >= specialValuesDoubleCount )
+            if (++x >= specialValuesDoubleCount)
             {
                 x = 0;
                 y++;
-                if( y >= specialValuesInt2Count )
-                    break;
+                if (y >= specialValuesInt2Count) break;
             }
         }
     }
 
-    //Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
     {
         p[j] = DoubleFromUInt32(genrand_int32(d));
         p2[j] = genrand_int32(d);
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size/2, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size / 2, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
 
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
-    r = (cl_double *)gOut_Ref  + thread_id * buffer_elements;
-    s = (cl_double *)gIn  + thread_id * buffer_elements;
-    s2 = (cl_int *)gIn2  + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = (cl_double) func.f_fi( s[j], s2[j] );
+    // Calculate the correctly rounded reference result
+    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
+    s = (cl_double *)gIn + thread_id * buffer_elements;
+    s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        r[j] = (cl_double)func.f_fi(s[j], s2[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    //Verify data
+    // Verify data
     t = (cl_ulong *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_ulong *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if( t[j] != q[j] )
+            if (t[j] != q[j])
             {
-                cl_double test = ((cl_double*) q)[j];
-                long double correct = func.f_fi( s[j], s2[j] );
-                float err = Bruteforce_Ulp_Error_Double( test, correct );
-                int fail = ! (fabsf(err) <= ulps);
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_fi(s[j], s2[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
 
-                if( fail && ftz )
+                if (fail && ftz)
                 {
                     // retry per section 6.5.3.2
-                    if( IsDoubleResultSubnormal(correct, ulps ) )
+                    if (IsDoubleResultSubnormal(correct, ulps))
                     {
-                        fail = fail && ( test != 0.0f );
-                        if( ! fail )
-                            err = 0.0f;
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
                     }
 
                     // retry per section 6.5.3.3
-                    if( IsDoubleSubnormal( s[j] ) )
+                    if (IsDoubleSubnormal(s[j]))
                     {
-                        long double correct2 = func.f_fi( 0.0, s2[j] );
-                        long double correct3 = func.f_fi( -0.0, s2[j] );
-                        float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                        float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                        fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                        if( fabsf( err2 ) < fabsf(err ) )
-                            err = err2;
-                        if( fabsf( err3 ) < fabsf(err ) )
-                            err = err3;
+                        long double correct2 = func.f_fi(0.0, s2[j]);
+                        long double correct3 = func.f_fi(-0.0, s2[j]);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct2);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) )
+                        if (IsDoubleResultSubnormal(correct2, ulps)
+                            || IsDoubleResultSubnormal(correct3, ulps))
                         {
-                            fail = fail && ( test != 0.0f);
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
                     }
                 }
 
-                if( fabsf(err ) > tinfo->maxError )
+                if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);
                     tinfo->maxErrorValue = s[j];
                     tinfo->maxErrorValue2 = s2[j];
                 }
-                if( fail )
+                if (fail)
                 {
-                    vlog_error( "\nERROR: %s%s: %f ulp error at {%.13la, %d}: *%.13la vs. %.13la\n", name, sizeNames[k], err, s[j], s2[j], r[j], test );
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, %d}: "
+                               "*%.13la vs. %.13la\n",
+                               name, sizeNames[k], err, s[j], s2[j], r[j],
+                               test);
                     error = -1;
                     goto exit;
                 }
@@ -1223,35 +1615,36 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-           vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-          vlog("." );
-       }
-       fflush(stdout);
+            vlog(".");
+        }
+        fflush(stdout);
     }
 
 exit:
     return error;
-
 }
-
-
-
diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
index af1b04d1dc..5065b280df 100644
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -36,68 +36,90 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i], out2 + i );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global int* out2, __global float* in, __global float* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-                            "       int3 i0 = 0xdeaddead;\n"
-                            "       f0 = ", name, "( f0, f1, &i0 );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "       vstore3( i0, 0, out2 + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 f0, f1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       int3 i0 = 0xdeaddead;\n"
-                            "       f0 = ", name, "( f0, f1, &i0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               out2[3*i+1] = i0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               out2[3*i] = i0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global int",
+                        sizeNames[vectorSize],
+                        "* out2, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], out2 + i );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global int* out2, __global float* in, "
+        "__global float* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, &i0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( i0, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0, f1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, &i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
@@ -106,95 +128,121 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i], out2 + i );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global int* out2, __global double* in, __global double* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       double3 d0 = vload3( 0, in + 3 * i );\n"
-                            "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-                            "       int3 i0 = 0xdeaddead;\n"
-                            "       d0 = ", name, "( d0, d1, &i0 );\n"
-                            "       vstore3( d0, 0, out + 3*i );\n"
-                            "       vstore3( i0, 0, out2 + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       double3 d0, d1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       int3 i0 = 0xdeaddead;\n"
-                            "       d0 = ", name, "( d0, d1, &i0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = d0.y; \n"
-                            "               out2[3*i+1] = i0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = d0.x; \n"
-                            "               out2[3*i] = i0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global int",
+                        sizeNames[vectorSize],
+                        "* out2, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], out2 + i );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global int* out2, __global double* in, "
+        "__global double* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, &i0 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "       vstore3( i0, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0, d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, &i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               out2[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               out2[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_kernel   *kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernels + i,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
                              info->programs + i, info->relaxedMode);
@@ -207,7 +255,7 @@ typedef struct ComputeReferenceInfoF_
     const float *y;
     float *r;
     int *i;
-    double (*f_ffpI)(double, double, int*);
+    double (*f_ffpI)(double, double, int *);
     cl_uint lim;
     cl_uint count;
 } ComputeReferenceInfoF;
@@ -218,13 +266,12 @@ typedef struct ComputeReferenceInfoD_
     const double *y;
     double *r;
     int *i;
-    long double (*f_ffpI)(long double, long double, int*);
+    long double (*f_ffpI)(long double, long double, int *);
     cl_uint lim;
     cl_uint count;
 } ComputeReferenceInfoD;
 
-static cl_int
-ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
+static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
 {
     ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
     cl_uint lim = cri->lim;
@@ -237,17 +284,15 @@ ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
     double (*f)(double, double, int *) = cri->f_ffpI;
     cl_uint j;
 
-    if (off + count > lim)
-    count = lim - off;
+    if (off + count > lim) count = lim - off;
 
     for (j = 0; j < count; ++j)
-    r[j] = (float)f((double)x[j], (double)y[j], i + j);
+        r[j] = (float)f((double)x[j], (double)y[j], i + j);
 
     return CL_SUCCESS;
 }
 
-static cl_int
-ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
+static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
 {
     ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
     cl_uint lim = cri->lim;
@@ -260,13 +305,12 @@ ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
     long double (*f)(long double, long double, int *) = cri->f_ffpI;
     cl_uint j;
 
-    if (off + count > lim)
-    count = lim - off;
+    if (off + count > lim) count = lim - off;
 
     Force64BitFPUPrecision();
 
     for (j = 0; j < count; ++j)
-    r[j] = (double)f((long double)x[j], (long double)y[j], i + j);
+        r[j] = (double)f((long double)x[j], (long double)y[j], i + j);
 
     return CL_SUCCESS;
 }
@@ -278,15 +322,15 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     float float_ulps;
     int64_t maxError2 = 0;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
 
 #if defined PARALLEL_REFERENCE
@@ -294,7 +338,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 #endif
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
-    if( gIsEmbedded )
+    if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
         float_ulps = f->float_ulps;
@@ -305,392 +349,480 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             return error;
     }
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
+       programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         cl_uint *p = (cl_uint *)gIn;
         cl_uint *p2 = (cl_uint *)gIn2;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
 
             memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
         // Calculate the correctly rounded reference result
         float *s = (float *)gIn;
         float *s2 = (float *)gIn2;
 
 #if defined PARALLEL_REFERENCE
-    if (threadCount > 1) {
-        ComputeReferenceInfoF cri;
-        cri.x = s;
-        cri.y = s2;
-        cri.r = (float *)gOut_Ref;
-        cri.i = (int *)gOut_Ref2;
-        cri.f_ffpI = f->func.f_ffpI;
-        cri.lim = bufferSize / sizeof( float );
-        cri.count = (cri.lim + threadCount - 1) / threadCount;
-        ThreadPool_Do(ReferenceF, threadCount, &cri);
-    } else {
+        if (threadCount > 1)
+        {
+            ComputeReferenceInfoF cri;
+            cri.x = s;
+            cri.y = s2;
+            cri.r = (float *)gOut_Ref;
+            cri.i = (int *)gOut_Ref2;
+            cri.f_ffpI = f->func.f_ffpI;
+            cri.lim = bufferSize / sizeof(float);
+            cri.count = (cri.lim + threadCount - 1) / threadCount;
+            ThreadPool_Do(ReferenceF, threadCount, &cri);
+        }
+        else
+        {
 #endif
             float *r = (float *)gOut_Ref;
             int *r2 = (int *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                r[j] = (float) f->func.f_ffpI( s[j], s2[j], r2+j );
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                r[j] = (float)f->func.f_ffpI(s[j], s2[j], r2 + j);
 #if defined PARALLEL_REFERENCE
-    }
+        }
 #endif
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray2 failed %d\n", error );
+                vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)gOut[k];
                 int32_t *q2 = (int32_t *)gOut2[k];
 
                 // Check for exact match to correctly rounded result
-        if (t[j] == q[j] && t2[j] == q2[j])
-            continue;
+                if (t[j] == q[j] && t2[j] == q2[j]) continue;
 
-        // Check for paired NaNs
-        if ((t[j] & 0x7fffffff) > 0x7f800000 && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j])
-            continue;
+                // Check for paired NaNs
+                if ((t[j] & 0x7fffffff) > 0x7f800000
+                    && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j])
+                    continue;
 
                 // if( t[j] != q[j] || t2[j] != q2[j] )
                 {
-                    float test = ((float*) q)[j];
+                    float test = ((float *)q)[j];
                     int correct2 = INT_MIN;
-                    double correct = f->func.f_ffpI( s[j], s2[j], &correct2 );
-                    float err = Ulp_Error( test, correct );
+                    double correct = f->func.f_ffpI(s[j], s2[j], &correct2);
+                    float err = Ulp_Error(test, correct);
                     int64_t iErr;
 
-                    // in case of remquo, we only care about the sign and last seven bits of
-                    // integer as per the spec.
-                    if(testingRemquo)
-                        iErr = (long long) (q2[j] & 0x0000007f) - (long long) (correct2 & 0x0000007f);
+                    // in case of remquo, we only care about the sign and last
+                    // seven bits of integer as per the spec.
+                    if (testingRemquo)
+                        iErr = (long long)(q2[j] & 0x0000007f)
+                            - (long long)(correct2 & 0x0000007f);
                     else
-                        iErr = (long long) q2[j] - (long long) correct2;
-
-                    //For remquo, if y = 0, x is infinite, or either is NaN then the standard either neglects
-                    //to say what is returned in iptr or leaves it undefined or implementation defined.
-                    int iptrUndefined = fabs(((float*) gIn)[j]) == INFINITY ||
-                                        ((float*) gIn2)[j] == 0.0f          ||
-                                        isnan(((float*) gIn2)[j])           ||
-                                        isnan(((float*) gIn)[j]);
-                    if(iptrUndefined)
-                         iErr = 0;
-
-                    int fail = ! (fabsf(err) <= float_ulps && iErr == 0 );
-                    if( ftz && fail )
+                        iErr = (long long)q2[j] - (long long)correct2;
+
+                    // For remquo, if y = 0, x is infinite, or either is NaN
+                    // then the standard either neglects to say what is returned
+                    // in iptr or leaves it undefined or implementation defined.
+                    int iptrUndefined = fabs(((float *)gIn)[j]) == INFINITY
+                        || ((float *)gIn2)[j] == 0.0f
+                        || isnan(((float *)gIn2)[j])
+                        || isnan(((float *)gIn)[j]);
+                    if (iptrUndefined) iErr = 0;
+
+                    int fail = !(fabsf(err) <= float_ulps && iErr == 0);
+                    if (ftz && fail)
                     {
                         // retry per section 6.5.3.2
-                        if( IsFloatResultSubnormal(correct, float_ulps ) )
+                        if (IsFloatResultSubnormal(correct, float_ulps))
                         {
-                            fail = fail && ! ( test == 0.0f && iErr == 0 );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && !(test == 0.0f && iErr == 0);
+                            if (!fail) err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( IsFloatSubnormal( s[j] ) )
+                        if (IsFloatSubnormal(s[j]))
                         {
                             int correct3i, correct4i;
-                            double correct3 = f->func.f_ffpI( 0.0, s2[j], &correct3i );
-                            double correct4 = f->func.f_ffpI( -0.0, s2[j], &correct4i );
-                            float err2 = Ulp_Error( test, correct3  );
-                            float err3 = Ulp_Error( test, correct4  );
-                            int64_t iErr3 = (long long) q2[j] - (long long) correct3i;
-                            int64_t iErr4 = (long long) q2[j] - (long long) correct4i;
-                            fail =  fail && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) && (!(fabsf(err3) <= float_ulps && iErr4 == 0)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-                            if( llabs(iErr3) < llabs( iErr ) )
-                                iErr = iErr3;
-                            if( llabs(iErr4) < llabs( iErr ) )
-                                iErr = iErr4;
+                            double correct3 =
+                                f->func.f_ffpI(0.0, s2[j], &correct3i);
+                            double correct4 =
+                                f->func.f_ffpI(-0.0, s2[j], &correct4i);
+                            float err2 = Ulp_Error(test, correct3);
+                            float err3 = Ulp_Error(test, correct4);
+                            int64_t iErr3 =
+                                (long long)q2[j] - (long long)correct3i;
+                            int64_t iErr4 =
+                                (long long)q2[j] - (long long)correct4i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
+                                    && (!(fabsf(err3) <= float_ulps
+                                          && iErr4 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
 
                             // retry per section 6.5.3.4
-                            if( IsFloatResultSubnormal(correct2, float_ulps ) || IsFloatResultSubnormal(correct3, float_ulps ) )
+                            if (IsFloatResultSubnormal(correct2, float_ulps)
+                                || IsFloatResultSubnormal(correct3, float_ulps))
                             {
-                                fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0) );
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0));
+                                if (!fail) err = 0.0f;
                             }
 
-                            //try with both args as zero
-                            if( IsFloatSubnormal( s2[j] ) )
+                            // try with both args as zero
+                            if (IsFloatSubnormal(s2[j]))
                             {
                                 int correct7i, correct8i;
-                                correct3 = f->func.f_ffpI( 0.0, 0.0, &correct3i );
-                                correct4 = f->func.f_ffpI( -0.0, 0.0, &correct4i );
-                                double correct7 = f->func.f_ffpI( 0.0, -0.0, &correct7i );
-                                double correct8 = f->func.f_ffpI( -0.0, -0.0, &correct8i );
-                                err2 = Ulp_Error( test, correct3  );
-                                err3 = Ulp_Error( test, correct4  );
-                                float err4 = Ulp_Error( test, correct7  );
-                                float err5 = Ulp_Error( test, correct8  );
-                                iErr3 = (long long) q2[j] - (long long) correct3i;
-                                iErr4 = (long long) q2[j] - (long long) correct4i;
-                                int64_t iErr7 = (long long) q2[j] - (long long) correct7i;
-                                int64_t iErr8 = (long long) q2[j] - (long long) correct8i;
-                                fail =  fail && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) && (!(fabsf(err3) <= float_ulps  && iErr4 == 0)) &&
-                                                 (!(fabsf(err4) <= float_ulps  && iErr7 == 0)) && (!(fabsf(err5) <= float_ulps  && iErr8 == 0)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
-                                if( llabs(iErr3) < llabs( iErr ) )
-                                    iErr = iErr3;
-                                if( llabs(iErr4) < llabs( iErr ) )
-                                    iErr = iErr4;
-                                if( llabs(iErr7) < llabs( iErr ) )
-                                    iErr = iErr7;
-                                if( llabs(iErr8) < llabs( iErr ) )
-                                    iErr = iErr8;
+                                correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i);
+                                correct4 =
+                                    f->func.f_ffpI(-0.0, 0.0, &correct4i);
+                                double correct7 =
+                                    f->func.f_ffpI(0.0, -0.0, &correct7i);
+                                double correct8 =
+                                    f->func.f_ffpI(-0.0, -0.0, &correct8i);
+                                err2 = Ulp_Error(test, correct3);
+                                err3 = Ulp_Error(test, correct4);
+                                float err4 = Ulp_Error(test, correct7);
+                                float err5 = Ulp_Error(test, correct8);
+                                iErr3 = (long long)q2[j] - (long long)correct3i;
+                                iErr4 = (long long)q2[j] - (long long)correct4i;
+                                int64_t iErr7 =
+                                    (long long)q2[j] - (long long)correct7i;
+                                int64_t iErr8 =
+                                    (long long)q2[j] - (long long)correct8i;
+                                fail = fail
+                                    && ((!(fabsf(err2) <= float_ulps
+                                           && iErr3 == 0))
+                                        && (!(fabsf(err3) <= float_ulps
+                                              && iErr4 == 0))
+                                        && (!(fabsf(err4) <= float_ulps
+                                              && iErr7 == 0))
+                                        && (!(fabsf(err5) <= float_ulps
+                                              && iErr8 == 0)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+                                if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                                if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+                                if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
+                                if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
 
                                 // retry per section 6.5.3.4
-                                if( IsFloatResultSubnormal(correct3, float_ulps ) || IsFloatResultSubnormal(correct4, float_ulps )  ||
-                                    IsFloatResultSubnormal(correct7, float_ulps ) || IsFloatResultSubnormal(correct8, float_ulps ) )
+                                if (IsFloatResultSubnormal(correct3, float_ulps)
+                                    || IsFloatResultSubnormal(correct4,
+                                                              float_ulps)
+                                    || IsFloatResultSubnormal(correct7,
+                                                              float_ulps)
+                                    || IsFloatResultSubnormal(correct8,
+                                                              float_ulps))
                                 {
-                                    fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0 || iErr7 == 0 || iErr8 == 0));
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail
+                                        && !(test == 0.0f
+                                             && (iErr3 == 0 || iErr4 == 0
+                                                 || iErr7 == 0 || iErr8 == 0));
+                                    if (!fail) err = 0.0f;
                                 }
                             }
                         }
-                        else if( IsFloatSubnormal( s2[j] ) )
+                        else if (IsFloatSubnormal(s2[j]))
                         {
                             int correct3i, correct4i;
-                            double correct3 = f->func.f_ffpI( s[j], 0.0, &correct3i );
-                            double correct4 = f->func.f_ffpI( s[j], -0.0, &correct4i );
-                            float err2 = Ulp_Error( test, correct3  );
-                            float err3 = Ulp_Error( test, correct4  );
-                            int64_t iErr3 = (long long) q2[j] - (long long) correct3i;
-                            int64_t iErr4 = (long long) q2[j] - (long long) correct4i;
-                            fail =  fail && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) && (!(fabsf(err3) <= float_ulps && iErr4 == 0)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-                            if( llabs(iErr3) < llabs( iErr ) )
-                                iErr = iErr3;
-                            if( llabs(iErr4) < llabs( iErr ) )
-                                iErr = iErr4;
+                            double correct3 =
+                                f->func.f_ffpI(s[j], 0.0, &correct3i);
+                            double correct4 =
+                                f->func.f_ffpI(s[j], -0.0, &correct4i);
+                            float err2 = Ulp_Error(test, correct3);
+                            float err3 = Ulp_Error(test, correct4);
+                            int64_t iErr3 =
+                                (long long)q2[j] - (long long)correct3i;
+                            int64_t iErr4 =
+                                (long long)q2[j] - (long long)correct4i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
+                                    && (!(fabsf(err3) <= float_ulps
+                                          && iErr4 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
 
                             // retry per section 6.5.3.4
-                            if( IsFloatResultSubnormal(correct2, float_ulps ) || IsFloatResultSubnormal(correct3, float_ulps ) )
+                            if (IsFloatResultSubnormal(correct2, float_ulps)
+                                || IsFloatResultSubnormal(correct3, float_ulps))
                             {
-                                fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0) );
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0));
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
                     }
-                    if( llabs(iErr) > maxError2 )
+                    if (llabs(iErr) > maxError2)
                     {
-                        maxError2 = llabs(iErr );
+                        maxError2 = llabs(iErr);
                         maxErrorVal2 = s[j];
                     }
 
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} ({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, 0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n",
-                                    f->name, sizeNames[k], err, iErr,
-                                   ((float*) gIn)[j], ((float*) gIn2)[j],
-                                   ((cl_uint*) gIn)[j], ((cl_uint*) gIn2)[j],
-                                   ((float*) gOut_Ref)[j], ((int*) gOut_Ref2)[j],
-                                   ((cl_uint*) gOut_Ref)[j], ((cl_uint*) gOut_Ref2)[j],
-                                   test, q2[j],
-                                   ((cl_uint*)&test)[0], ((cl_uint*) q2)[j] );
-                      error = -1;
-                      goto exit;
+                        vlog_error(
+                            "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} "
+                            "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, "
+                            "0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n",
+                            f->name, sizeNames[k], err, iErr, ((float *)gIn)[j],
+                            ((float *)gIn2)[j], ((cl_uint *)gIn)[j],
+                            ((cl_uint *)gIn2)[j], ((float *)gOut_Ref)[j],
+                            ((int *)gOut_Ref2)[j], ((cl_uint *)gOut_Ref)[j],
+                            ((cl_uint *)gOut_Ref2)[j], test, q2[j],
+                            ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]);
+                        error = -1;
+                        goto exit;
                     }
                 }
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
-
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
             p[j] = genrand_int32(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -704,14 +836,14 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int64_t maxError2 = 0;
     int ftz = f->ftz || gForceFTZ;
     double maxErrorVal = 0.0f;
     double maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(double), bufferSize);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
@@ -728,400 +860,504 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
-                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                    &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
         {
             return error;
         }
     }
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
+       i, programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
 
             memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         double *s = (double *)gIn;
         double *s2 = (double *)gIn2;
 
 #if defined PARALLEL_REFERENCE
-    if (threadCount > 1) {
-        ComputeReferenceInfoD cri;
-        cri.x = s;
-        cri.y = s2;
-        cri.r = (double *)gOut_Ref;
-        cri.i = (int *)gOut_Ref2;
-        cri.f_ffpI = f->dfunc.f_ffpI;
-        cri.lim = bufferSize / sizeof( double );
-        cri.count = (cri.lim + threadCount - 1) / threadCount;
-        ThreadPool_Do(ReferenceD, threadCount, &cri);
-    } else {
+        if (threadCount > 1)
+        {
+            ComputeReferenceInfoD cri;
+            cri.x = s;
+            cri.y = s2;
+            cri.r = (double *)gOut_Ref;
+            cri.i = (int *)gOut_Ref2;
+            cri.f_ffpI = f->dfunc.f_ffpI;
+            cri.lim = bufferSize / sizeof(double);
+            cri.count = (cri.lim + threadCount - 1) / threadCount;
+            ThreadPool_Do(ReferenceD, threadCount, &cri);
+        }
+        else
+        {
 #endif
             double *r = (double *)gOut_Ref;
             int *r2 = (int *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-                r[j] = (double) f->dfunc.f_ffpI( s[j], s2[j], r2+j );
+            for (j = 0; j < bufferSize / sizeof(double); j++)
+                r[j] = (double)f->dfunc.f_ffpI(s[j], s2[j], r2 + j);
 #if defined PARALLEL_REFERENCE
-    }
+        }
 #endif
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray2 failed %d\n", error );
+                vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint64_t *q = (uint64_t *)gOut[k];
                 int32_t *q2 = (int32_t *)gOut2[k];
 
-        // Check for exact match to correctly rounded result
-        if (t[j] == q[j] && t2[j] == q2[j])
-            continue;
+                // Check for exact match to correctly rounded result
+                if (t[j] == q[j] && t2[j] == q2[j]) continue;
 
-        // Check for paired NaNs
-        if ((t[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL &&
-            (q[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL &&
-            t2[j] == q2[j])
-            continue;
+                // Check for paired NaNs
+                if ((t[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL
+                    && (q[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL
+                    && t2[j] == q2[j])
+                    continue;
 
                 // if( t[j] != q[j] || t2[j] != q2[j] )
                 {
-                    double test = ((double*) q)[j];
+                    double test = ((double *)q)[j];
                     int correct2 = INT_MIN;
-                    long double correct = f->dfunc.f_ffpI( s[j], s2[j], &correct2 );
-                    float err = Bruteforce_Ulp_Error_Double( test, correct );
+                    long double correct =
+                        f->dfunc.f_ffpI(s[j], s2[j], &correct2);
+                    float err = Bruteforce_Ulp_Error_Double(test, correct);
                     int64_t iErr;
 
-                    // in case of remquo, we only care about the sign and last seven bits of
-                    // integer as per the spec.
-                    if(testingRemquo)
-                        iErr = (long long) (q2[j] & 0x0000007f) - (long long) (correct2 & 0x0000007f);
+                    // in case of remquo, we only care about the sign and last
+                    // seven bits of integer as per the spec.
+                    if (testingRemquo)
+                        iErr = (long long)(q2[j] & 0x0000007f)
+                            - (long long)(correct2 & 0x0000007f);
                     else
-                        iErr = (long long) q2[j] - (long long) correct2;
-
-                    //For remquo, if y = 0, x is infinite, or either is NaN then the standard either neglects
-                    //to say what is returned in iptr or leaves it undefined or implementation defined.
-                    int iptrUndefined = fabs(((double*) gIn)[j]) == INFINITY ||
-                                        ((double*) gIn2)[j] == 0.0          ||
-                                        isnan(((double*) gIn2)[j])           ||
-                                        isnan(((double*) gIn)[j]);
-                    if(iptrUndefined)
-                         iErr = 0;
-
-                    int fail = ! (fabsf(err) <= f->double_ulps && iErr == 0 );
-                    if( ftz && fail )
+                        iErr = (long long)q2[j] - (long long)correct2;
+
+                    // For remquo, if y = 0, x is infinite, or either is NaN
+                    // then the standard either neglects to say what is returned
+                    // in iptr or leaves it undefined or implementation defined.
+                    int iptrUndefined = fabs(((double *)gIn)[j]) == INFINITY
+                        || ((double *)gIn2)[j] == 0.0
+                        || isnan(((double *)gIn2)[j])
+                        || isnan(((double *)gIn)[j]);
+                    if (iptrUndefined) iErr = 0;
+
+                    int fail = !(fabsf(err) <= f->double_ulps && iErr == 0);
+                    if (ftz && fail)
                     {
                         // retry per section 6.5.3.2
-                        if( IsDoubleResultSubnormal(correct, f->double_ulps ) )
+                        if (IsDoubleResultSubnormal(correct, f->double_ulps))
                         {
-                            fail = fail && ! ( test == 0.0f && iErr == 0 );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && !(test == 0.0f && iErr == 0);
+                            if (!fail) err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( IsDoubleSubnormal( s[j] ) )
+                        if (IsDoubleSubnormal(s[j]))
                         {
                             int correct3i, correct4i;
-                            long double correct3 = f->dfunc.f_ffpI( 0.0, s2[j], &correct3i );
-                            long double correct4 = f->dfunc.f_ffpI( -0.0, s2[j], &correct4i );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                            int64_t iErr3 = (long long) q2[j] - (long long) correct3i;
-                            int64_t iErr4 = (long long) q2[j] - (long long) correct4i;
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0)) && (!(fabsf(err3) <= f->double_ulps && iErr4 == 0)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-                            if( llabs(iErr3) < llabs( iErr ) )
-                                iErr = iErr3;
-                            if( llabs(iErr4) < llabs( iErr ) )
-                                iErr = iErr4;
+                            long double correct3 =
+                                f->dfunc.f_ffpI(0.0, s2[j], &correct3i);
+                            long double correct4 =
+                                f->dfunc.f_ffpI(-0.0, s2[j], &correct4i);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct4);
+                            int64_t iErr3 =
+                                (long long)q2[j] - (long long)correct3i;
+                            int64_t iErr4 =
+                                (long long)q2[j] - (long long)correct4i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps
+                                       && iErr3 == 0))
+                                    && (!(fabsf(err3) <= f->double_ulps
+                                          && iErr4 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
                             {
-                                fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0) );
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0));
+                                if (!fail) err = 0.0f;
                             }
 
-                            //try with both args as zero
-                            if( IsDoubleSubnormal( s2[j] ) )
+                            // try with both args as zero
+                            if (IsDoubleSubnormal(s2[j]))
                             {
                                 int correct7i, correct8i;
-                                correct3 = f->dfunc.f_ffpI( 0.0, 0.0, &correct3i );
-                                correct4 = f->dfunc.f_ffpI( -0.0, 0.0, &correct4i );
-                                long double correct7 = f->dfunc.f_ffpI( 0.0, -0.0, &correct7i );
-                                long double correct8 = f->dfunc.f_ffpI( -0.0, -0.0, &correct8i );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct7  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct8  );
-                                iErr3 = (long long) q2[j] - (long long) correct3i;
-                                iErr4 = (long long) q2[j] - (long long) correct4i;
-                                int64_t iErr7 = (long long) q2[j] - (long long) correct7i;
-                                int64_t iErr8 = (long long) q2[j] - (long long) correct8i;
-                                fail =  fail && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0)) && (!(fabsf(err3) <= f->double_ulps  && iErr4 == 0)) &&
-                                                 (!(fabsf(err4) <= f->double_ulps  && iErr7 == 0)) && (!(fabsf(err5) <= f->double_ulps  && iErr8 == 0)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
-                                if( llabs(iErr3) < llabs( iErr ) )
-                                    iErr = iErr3;
-                                if( llabs(iErr4) < llabs( iErr ) )
-                                    iErr = iErr4;
-                                if( llabs(iErr7) < llabs( iErr ) )
-                                    iErr = iErr7;
-                                if( llabs(iErr8) < llabs( iErr ) )
-                                    iErr = iErr8;
+                                correct3 =
+                                    f->dfunc.f_ffpI(0.0, 0.0, &correct3i);
+                                correct4 =
+                                    f->dfunc.f_ffpI(-0.0, 0.0, &correct4i);
+                                long double correct7 =
+                                    f->dfunc.f_ffpI(0.0, -0.0, &correct7i);
+                                long double correct8 =
+                                    f->dfunc.f_ffpI(-0.0, -0.0, &correct8i);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct7);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct8);
+                                iErr3 = (long long)q2[j] - (long long)correct3i;
+                                iErr4 = (long long)q2[j] - (long long)correct4i;
+                                int64_t iErr7 =
+                                    (long long)q2[j] - (long long)correct7i;
+                                int64_t iErr8 =
+                                    (long long)q2[j] - (long long)correct8i;
+                                fail = fail
+                                    && ((!(fabsf(err2) <= f->double_ulps
+                                           && iErr3 == 0))
+                                        && (!(fabsf(err3) <= f->double_ulps
+                                              && iErr4 == 0))
+                                        && (!(fabsf(err4) <= f->double_ulps
+                                              && iErr7 == 0))
+                                        && (!(fabsf(err5) <= f->double_ulps
+                                              && iErr8 == 0)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+                                if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                                if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+                                if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
+                                if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
 
                                 // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct3, f->double_ulps ) || IsDoubleResultSubnormal( correct4, f->double_ulps )  ||
-                                    IsDoubleResultSubnormal( correct7, f->double_ulps ) || IsDoubleResultSubnormal( correct8, f->double_ulps ) )
+                                if (IsDoubleResultSubnormal(correct3,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct4,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct7,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct8,
+                                                               f->double_ulps))
                                 {
-                                    fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0 || iErr7 == 0 || iErr8 == 0));
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail
+                                        && !(test == 0.0f
+                                             && (iErr3 == 0 || iErr4 == 0
+                                                 || iErr7 == 0 || iErr8 == 0));
+                                    if (!fail) err = 0.0f;
                                 }
                             }
                         }
-                        else if( IsDoubleSubnormal( s2[j] ) )
+                        else if (IsDoubleSubnormal(s2[j]))
                         {
                             int correct3i, correct4i;
-                            long double correct3 = f->dfunc.f_ffpI( s[j], 0.0, &correct3i );
-                            long double correct4 = f->dfunc.f_ffpI( s[j], -0.0, &correct4i );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                            int64_t iErr3 = (long long) q2[j] - (long long) correct3i;
-                            int64_t iErr4 = (long long) q2[j] - (long long) correct4i;
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0)) && (!(fabsf(err3) <= f->double_ulps && iErr4 == 0)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-                            if( llabs(iErr3) < llabs( iErr ) )
-                                iErr = iErr3;
-                            if( llabs(iErr4) < llabs( iErr ) )
-                                iErr = iErr4;
+                            long double correct3 =
+                                f->dfunc.f_ffpI(s[j], 0.0, &correct3i);
+                            long double correct4 =
+                                f->dfunc.f_ffpI(s[j], -0.0, &correct4i);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct4);
+                            int64_t iErr3 =
+                                (long long)q2[j] - (long long)correct3i;
+                            int64_t iErr4 =
+                                (long long)q2[j] - (long long)correct4i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps
+                                       && iErr3 == 0))
+                                    && (!(fabsf(err3) <= f->double_ulps
+                                          && iErr4 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
                             {
-                                fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0) );
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0));
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
                     }
-                    if( llabs(iErr) > maxError2 )
+                    if (llabs(iErr) > maxError2)
                     {
-                        maxError2 = llabs(iErr );
+                        maxError2 = llabs(iErr);
                         maxErrorVal2 = s[j];
                     }
 
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %sD%s: {%f, %lld} ulp error at {%.13la, %.13la} ({ 0x%16.16llx, 0x%16.16llx}): *{%.13la, %d} ({ 0x%16.16llx, 0x%8.8x}) vs. {%.13la, %d} ({ 0x%16.16llx, 0x%8.8x})\n",
-                                    f->name, sizeNames[k], err, iErr,
-                                   ((double*) gIn)[j], ((double*) gIn2)[j],
-                                   ((cl_ulong*) gIn)[j], ((cl_ulong*) gIn2)[j],
-                                   ((double*) gOut_Ref)[j], ((int*) gOut_Ref2)[j],
-                                   ((cl_ulong*) gOut_Ref)[j], ((cl_uint*) gOut_Ref2)[j],
-                                   test, q2[j],
-                                   ((cl_ulong*) q)[j], ((cl_uint*) q2)[j]);
-                      error = -1;
-                      goto exit;
+                        vlog_error(
+                            "\nERROR: %sD%s: {%f, %lld} ulp error at {%.13la, "
+                            "%.13la} ({ 0x%16.16llx, 0x%16.16llx}): *{%.13la, "
+                            "%d} ({ 0x%16.16llx, 0x%8.8x}) vs. {%.13la, %d} ({ "
+                            "0x%16.16llx, 0x%8.8x})\n",
+                            f->name, sizeNames[k], err, iErr,
+                            ((double *)gIn)[j], ((double *)gIn2)[j],
+                            ((cl_ulong *)gIn)[j], ((cl_ulong *)gIn2)[j],
+                            ((double *)gOut_Ref)[j], ((int *)gOut_Ref2)[j],
+                            ((cl_ulong *)gOut_Ref)[j],
+                            ((cl_uint *)gOut_Ref2)[j], test, q2[j],
+                            ((cl_ulong *)q)[j], ((cl_uint *)q2)[j]);
+                        error = -1;
+                        goto exit;
                     }
                 }
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-            p[j] = DoubleFromUInt32( genrand_int32(d) );
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -1129,6 +1365,3 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
     return error;
 }
-
-
-
diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary.cpp
index f6bd1223fe..7e2073793a 100644
--- a/test_conformance/math_brute_force/i_unary.cpp
+++ b/test_conformance/math_brute_force/i_unary.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -33,60 +33,77 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       int3 i0 = ", name, "( f0 );\n"
-                            "       vstore3( i0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 f0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       int3 i0 = ", name, "( f0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = i0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = i0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global int",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global int* out, __global float* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( i0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
@@ -95,88 +112,109 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = {"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global double* in)\n"
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global int",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in)\n"
                         "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   if( i + 1 < get_global_size(0) )\n"
-                        "   {\n"
-                        "       double3 f0 = vload3( 0, in + 3 * i );\n"
-                        "       int3 i0 = ", name, "( f0 );\n"
-                        "       vstore3( i0, 0, out + 3*i );\n"
-                        "   }\n"
-                        "   else\n"
-                        "   {\n"
-                        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                        "       double3 f0;\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 1:\n"
-                        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
-                        "               break;\n"
-                        "           case 0:\n"
-                        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                        "               break;\n"
-                        "       }\n"
-                        "       int3 i0 = ", name, "( f0 );\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 0:\n"
-                        "               out[3*i+1] = i0.y; \n"
-                        "               // fall through\n"
-                        "           case 1:\n"
-                        "               out[3*i] = i0.x; \n"
-                        "               break;\n"
-                        "       }\n"
-                        "   }\n"
-                        "}\n"
-                    };
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global int* out, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( i0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_kernel   *kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernels + i,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
                              info->programs + i, info->relaxedMode);
@@ -187,12 +225,12 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     int ftz = f->ftz || 0 == (gFloatCapabilities & CL_FP_DENORM) || gForceFTZ;
-    size_t bufferSize = (gWimpyMode)?gWimpyBufferSize:BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
@@ -206,191 +244,226 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
         return error;
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
+       programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        if( gWimpyMode )
+        if (gWimpyMode)
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                p[j] = (uint32_t) i + j * scale;
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j * scale;
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                p[j] = (uint32_t) i + j;
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         int *r = (int *)gOut_Ref;
         float *s = (float *)gIn;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
-            r[j] = f->func.i_f( s[j] );
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            r[j] = f->func.i_f(s[j]);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+                if (t[j] != q[j])
                 {
-                    if( ftz && IsFloatSubnormal(s[j]))
+                    if (ftz && IsFloatSubnormal(s[j]))
                     {
-                        unsigned int correct0 = f->func.i_f( 0.0 );
-                        unsigned int correct1 = f->func.i_f( -0.0 );
-                        if( q[j] == correct0 || q[j] == correct1 )
-                            continue;
+                        unsigned int correct0 = f->func.i_f(0.0);
+                        unsigned int correct1 = f->func.i_f(-0.0);
+                        if (q[j] == correct0 || q[j] == correct1) continue;
                     }
 
                     uint32_t err = t[j] - q[j];
-                    if( q[j] > t[j] )
-                        err = q[j] - t[j];
-                    vlog_error( "\nERROR: %s%s: %d ulp error at %a (0x%8.8x): *%d vs. %d\n", f->name, sizeNames[k], err, ((float*) gIn)[j], ((cl_uint*) gIn)[j], t[j], q[j] );
-                  error = -1;
-                  goto exit;
+                    if (q[j] > t[j]) err = q[j] - t[j];
+                    vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%8.8x): "
+                               "*%d vs. %d\n",
+                               f->name, sizeNames[k], err, ((float *)gIn)[j],
+                               ((cl_uint *)gIn)[j], t[j], q[j]);
+                    error = -1;
+                    goto exit;
                 }
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
             p[j] = genrand_int32(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    vlog( "\n" );
+    vlog("\n");
 exit:
     RestoreFPState(&oldMode);
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -404,12 +477,12 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     int ftz = f->ftz || gForceFTZ;
-    size_t bufferSize = (gWimpyMode)?gWimpyBufferSize:BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
-    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( cl_double )) + 1);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -423,200 +496,231 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
-                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
     {
         return error;
     }
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
+       i, programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
-        if( gWimpyMode )
+        if (gWimpyMode)
         {
-            for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
-                p[j] = DoubleFromUInt32( (uint32_t) i + j * scale );
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
-                p[j] = DoubleFromUInt32( (uint32_t) i + j );
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         int *r = (int *)gOut_Ref;
         double *s = (double *)gIn;
-        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
-            r[j] = f->dfunc.i_f( s[j] );
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            r[j] = f->dfunc.i_f(s[j]);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+                if (t[j] != q[j])
                 {
-                    if( ftz && IsDoubleSubnormal(s[j]))
+                    if (ftz && IsDoubleSubnormal(s[j]))
                     {
-                        unsigned int correct0 = f->dfunc.i_f( 0.0 );
-                        unsigned int correct1 = f->dfunc.i_f( -0.0 );
-                        if( q[j] == correct0 || q[j] == correct1 )
-                            continue;
+                        unsigned int correct0 = f->dfunc.i_f(0.0);
+                        unsigned int correct1 = f->dfunc.i_f(-0.0);
+                        if (q[j] == correct0 || q[j] == correct1) continue;
                     }
 
                     uint32_t err = t[j] - q[j];
-                    if( q[j] > t[j] )
-                        err = q[j] - t[j];
-                    vlog_error( "\nERROR: %sD%s: %d ulp error at %.13la: *%d vs. %d\n", f->name, sizeNames[k], err, ((double*) gIn)[j], t[j], q[j] );
-                  error = -1;
-                  goto exit;
+                    if (q[j] > t[j]) err = q[j] - t[j];
+                    vlog_error(
+                        "\nERROR: %sD%s: %d ulp error at %.13la: *%d vs. %d\n",
+                        f->name, sizeNames[k], err, ((double *)gIn)[j], t[j],
+                        q[j]);
+                    error = -1;
+                    goto exit;
                 }
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
             if (gVerboseBruteForce)
             {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-            } else
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
             {
-               vlog("." );
+                vlog(".");
             }
-           fflush(stdout);
-
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
-        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
-            p[j] = DoubleFromUInt32( genrand_int32(d) );
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    vlog( "\n" );
+    vlog("\n");
 
 
 exit:
     RestoreFPState(&oldMode);
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -624,4 +728,3 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 
     return error;
 }
-
diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary.cpp
index 1cde215ce3..52c4e96cd7 100644
--- a/test_conformance/math_brute_force/macro_binary.cpp
+++ b/test_conformance/math_brute_force/macro_binary.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -24,7 +24,8 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata, bool relaxedMode);
 extern const vtbl _macro_binary = { "macro_binary", TestMacro_Int_Float_Float,
                                     TestMacro_Int_Double_Double };
 
-static int BuildKernel( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p );
+static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                       cl_kernel *k, cl_program *p, bool relaxedMode);
 static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
                              bool relaxedMode);
@@ -32,26 +33,42 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
-    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2 )\n"
-        "{\n"
-        "   int i = get_global_id(0);\n"
-        "   out[i] = ", name, "( in1[i], in2[i] );\n"
-        "}\n"
-    };
-
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in, __global float* in2)\n"
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global int",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global int* out, __global float* in, __global float* in2)\n"
         "{\n"
         "   size_t i = get_global_id(0);\n"
         "   if( i + 1 < get_global_size(0) )\n"
         "   {\n"
         "       float3 f0 = vload3( 0, in + 3 * i );\n"
         "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-        "       int3 i0 = ", name, "( f0, f1 );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0, f1 );\n"
         "       vstore3( i0, 0, out + 3*i );\n"
         "   }\n"
         "   else\n"
         "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
         "       float3 f0, f1;\n"
         "       switch( parity )\n"
         "       {\n"
@@ -64,7 +81,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
         "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
         "               break;\n"
         "       }\n"
-        "       int3 i0 = ", name, "( f0, f1 );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0, f1 );\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 0:\n"
@@ -80,16 +99,17 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
 
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -101,27 +121,43 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                              bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel", sizeNames[vectorSize], "( __global long", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2 )\n"
-        "{\n"
-        "   int i = get_global_id(0);\n"
-        "   out[i] = ", name, "( in1[i], in2[i] );\n"
-        "}\n"
-    };
-
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel", sizeNames[vectorSize], "( __global long* out, __global double* in, __global double* in2)\n"
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global long",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global long* out, __global double* in, __global double* in2)\n"
         "{\n"
         "   size_t i = get_global_id(0);\n"
         "   if( i + 1 < get_global_size(0) )\n"
         "   {\n"
         "       double3 f0 = vload3( 0, in + 3 * i );\n"
         "       double3 f1 = vload3( 0, in2 + 3 * i );\n"
-        "       long3 l0 = ", name, "( f0, f1 );\n"
+        "       long3 l0 = ",
+        name,
+        "( f0, f1 );\n"
         "       vstore3( l0, 0, out + 3*i );\n"
         "   }\n"
         "   else\n"
         "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
         "       double3 f0, f1;\n"
         "       switch( parity )\n"
         "       {\n"
@@ -134,7 +170,9 @@ static int BuildKernelDouble(const char *name, int vectorSize,
         "               f1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
         "               break;\n"
         "       }\n"
-        "       long3 l0 = ", name, "( f0, f1 );\n"
+        "       long3 l0 = ",
+        name,
+        "( f0, f1 );\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 0:\n"
@@ -149,17 +187,18 @@ static int BuildKernelDouble(const char *name, int vectorSize,
     };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -167,27 +206,31 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_uint     kernel_count;
-    cl_kernel   **kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count; // kernels to build per vector size
+    cl_kernel **kernels; // kernels[vector size index] = array of kernels
+    cl_program *programs; // programs[vector size index]
+    const char *nameInCode; // function name emitted into the kernel source
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
                              info->kernels[i], info->programs + i,
@@ -197,72 +240,165 @@ static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, vo
 
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
-    -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),  MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), -1000.f, -100.f,  -4.0f, -3.5f,
-    -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f, MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),  MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f, MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f,
-
-    +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), +1000.f, +100.f, +4.0f, +3.5f,
-    +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f
+    -NAN,
+    -INFINITY,
+    -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
+    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
+    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
+    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
+    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
+    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
+    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
+    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
+    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
+    -1000.f,
+    -100.f,
+    -4.0f,
+    -3.5f,
+    -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
+    -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
+    -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
+    -1.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
+    -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
+    -0.5f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
+    -0.25f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
+    -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
+    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
+    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
+    -0.0f,
+
+    +NAN,
+    +INFINITY,
+    +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
+    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
+    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
+    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
+    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
+    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
+    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
+    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
+    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
+    +1000.f,
+    +100.f,
+    +4.0f,
+    +3.5f,
+    +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
+    2.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
+    +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
+    1.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
+    +1.0f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
+    +0.5f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
+    +0.25f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
+    +FLT_MIN,
+    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
+    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
+    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
+    +0.0f
 };
 
-static const size_t specialValuesFloatCount = sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
+static const size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
-//Thread specific data for a worker thread
+// Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
-    cl_mem      inBuf;                              // input buffer for the thread
-    cl_mem      inBuf2;                             // input buffer for the thread
-    cl_mem      outBuf[ VECTOR_SIZE_COUNT ];        // output buffers for the thread
-    MTdata      d;
-    cl_command_queue tQueue;                        // per thread command queue to improve performance
-}ThreadInfo;
+    cl_mem inBuf; // input buffer for the thread
+    cl_mem inBuf2; // input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    MTdata d;
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
 
 typedef struct TestInfo
 {
-    size_t      subBufferSize;                      // Size of the sub-buffer in elements
-    const Func  *f;                                 // A pointer to the function info
-    cl_program  programs[ VECTOR_SIZE_COUNT ];      // programs for various vector sizes
-    cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
-    ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
-    cl_uint     threadCount;                        // Number of worker threads
-    cl_uint     jobCount;                           // Number of jobs
-    cl_uint     step;                               // step between each chunk and the next.
-    cl_uint     scale;                              // stride between individual test values
-    int         ftz;                                // non-zero if running in flush to zero mode
-
-}TestInfo;
-
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    int ftz; // non-zero if running in flush to zero mode
+
+} TestInfo;
+
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
     if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -271,58 +407,79 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     }
 
     test_info.f = f;
-    test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
 
@@ -335,393 +492,446 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
 
     // Run the kernels
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
-    vlog( "\n" );
+    vlog("\n");
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_float );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    fptr        func = job->f->func;
-    int         ftz = job->ftz;
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    const char  *name = job->f->name;
-    cl_int      *t,*r;
-    cl_float    *s,*s2;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_int *t, *r;
+    cl_float *s, *s2;
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_int  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_int *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_int *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount = specialValuesFloatCount * specialValuesFloatCount;
+    int totalSpecialValueCount =
+        specialValuesFloatCount * specialValuesFloatCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if( job_id <= (cl_uint)indx )
+    if (job_id <= (cl_uint)indx)
     { // test edge cases
         float *fp = (float *)p;
         float *fp2 = (float *)p2;
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesFloatCount;
-    y = (job_id * buffer_elements) / specialValuesFloatCount;
+        x = (job_id * buffer_elements) % specialValuesFloatCount;
+        y = (job_id * buffer_elements) / specialValuesFloatCount;
 
-        for( ; j < buffer_elements; j++ )
+        for (; j < buffer_elements; j++)
         {
             fp[j] = specialValuesFloat[x];
             fp2[j] = specialValuesFloat[y];
-            if( ++x >= specialValuesFloatCount )
+            if (++x >= specialValuesFloatCount)
             {
                 x = 0;
                 y++;
-                if( y >= specialValuesFloatCount )
-                    break;
+                if (y >= specialValuesFloatCount) break;
             }
         }
     }
 
-    //Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
     {
         p[j] = genrand_int32(d);
         p2[j] = genrand_int32(d);
     }
 
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
 
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
-    r = (cl_int *)gOut_Ref  + thread_id * buffer_elements;
-    s = (float *)gIn  + thread_id * buffer_elements;
-    s2 = (float *)gIn2  + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = func.i_ff( s[j], s2[j] );
+    // Calculate the correctly rounded reference result
+    r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
+    s = (float *)gIn + thread_id * buffer_elements;
+    s2 = (float *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) r[j] = func.i_ff(s[j], s2[j]);
 
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_int *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                          CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                          0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    //Verify data
+    // Verify data
     t = (cl_int *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
         cl_int *q = out[0];
 
-        if( gMinVectorSizeIndex == 0 && t[j] != q[j] )
+        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
         {
-            if( ftz )
+            if (ftz)
             {
-                if( IsFloatSubnormal( s[j])  )
+                if (IsFloatSubnormal(s[j]))
                 {
-                    if( IsFloatSubnormal( s2[j] )  )
+                    if (IsFloatSubnormal(s2[j]))
                     {
-                        int correct = func.i_ff( 0.0f, 0.0f );
-                        int correct2 = func.i_ff( 0.0f, -0.0f );
-                        int correct3 = func.i_ff( -0.0f, 0.0f );
-                        int correct4 = func.i_ff( -0.0f, -0.0f );
+                        int correct = func.i_ff(0.0f, 0.0f);
+                        int correct2 = func.i_ff(0.0f, -0.0f);
+                        int correct3 = func.i_ff(-0.0f, 0.0f);
+                        int correct4 = func.i_ff(-0.0f, -0.0f);
 
-                        if( correct == q[j] || correct2 == q[j] || correct3 == q[j] || correct4 == q[j] )
+                        if (correct == q[j] || correct2 == q[j]
+                            || correct3 == q[j] || correct4 == q[j])
                             continue;
                     }
                     else
                     {
-                        int correct = func.i_ff( 0.0f, s2[j] );
-                        int correct2 = func.i_ff( -0.0f, s2[j] );
-                        if( correct == q[j] || correct2 == q[j]  )
-                            continue;
+                        int correct = func.i_ff(0.0f, s2[j]);
+                        int correct2 = func.i_ff(-0.0f, s2[j]);
+                        if (correct == q[j] || correct2 == q[j]) continue;
                     }
                 }
-                else if( IsFloatSubnormal( s2[j] ) )
+                else if (IsFloatSubnormal(s2[j]))
                 {
-                    int correct = func.i_ff( s[j], 0.0f );
-                    int correct2 = func.i_ff( s[j], -0.0f );
-                    if( correct == q[j] || correct2 == q[j]  )
-                        continue;
+                    int correct = func.i_ff(s[j], 0.0f);
+                    int correct2 = func.i_ff(s[j], -0.0f);
+                    if (correct == q[j] || correct2 == q[j]) continue;
                 }
-
             }
 
             uint32_t err = t[j] - q[j];
-            if( q[j] > t[j] )
-                err = q[j] - t[j];
-            vlog_error( "\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. 0x%8.8x (index: %d)\n", name, err, ((float*) s)[j], ((float*) s2)[j], t[j], q[j], j );
+            if (q[j] > t[j]) err = q[j] - t[j];
+            vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. "
+                       "0x%8.8x (index: %d)\n",
+                       name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j],
+                       j);
             error = -1;
             goto exit;
         }
 
-        for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ )
+        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
         {
             q = out[k];
             // If we aren't getting the correctly rounded result
-            if( -t[j] != q[j] )
+            if (-t[j] != q[j])
             {
-                if( ftz )
+                if (ftz)
                 {
-                    if( IsFloatSubnormal( s[j])  )
+                    if (IsFloatSubnormal(s[j]))
                     {
-                        if( IsFloatSubnormal( s2[j] )  )
+                        if (IsFloatSubnormal(s2[j]))
                         {
-                            int correct = -func.i_ff( 0.0f, 0.0f );
-                            int correct2 = -func.i_ff( 0.0f, -0.0f );
-                            int correct3 = -func.i_ff( -0.0f, 0.0f );
-                            int correct4 = -func.i_ff( -0.0f, -0.0f );
+                            int correct = -func.i_ff(0.0f, 0.0f);
+                            int correct2 = -func.i_ff(0.0f, -0.0f);
+                            int correct3 = -func.i_ff(-0.0f, 0.0f);
+                            int correct4 = -func.i_ff(-0.0f, -0.0f);
 
-                            if( correct == q[j] || correct2 == q[j] || correct3 == q[j] || correct4 == q[j] )
+                            if (correct == q[j] || correct2 == q[j]
+                                || correct3 == q[j] || correct4 == q[j])
                                 continue;
                         }
                         else
                         {
-                            int correct = -func.i_ff( 0.0f, s2[j] );
-                            int correct2 = -func.i_ff( -0.0f, s2[j] );
-                            if( correct == q[j] || correct2 == q[j]  )
-                                continue;
+                            int correct = -func.i_ff(0.0f, s2[j]);
+                            int correct2 = -func.i_ff(-0.0f, s2[j]);
+                            if (correct == q[j] || correct2 == q[j]) continue;
                         }
                     }
-                    else if( IsFloatSubnormal( s2[j] ) )
+                    else if (IsFloatSubnormal(s2[j]))
                     {
-                        int correct = -func.i_ff( s[j], 0.0f );
-                        int correct2 = -func.i_ff( s[j], -0.0f );
-                        if( correct == q[j] || correct2 == q[j]  )
-                            continue;
+                        int correct = -func.i_ff(s[j], 0.0f);
+                        int correct2 = -func.i_ff(s[j], -0.0f);
+                        if (correct == q[j] || correct2 == q[j]) continue;
                     }
-
                 }
                 cl_uint err = -t[j] - q[j];
-                if( q[j] > -t[j] )
-                    err = q[j] + t[j];
-                vlog_error( "\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x vs. 0x%8.8x (index: %d)\n", name, sizeNames[k], err, ((float*) s)[j], ((float*) s2)[j], -t[j], q[j], j );
+                if (q[j] > -t[j]) err = q[j] + t[j];
+                vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x "
+                           "vs. 0x%8.8x (index: %d)\n",
+                           name, sizeNames[k], err, ((float *)s)[j],
+                           ((float *)s2)[j], -t[j], q[j], j);
                 error = -1;
                 goto exit;
             }
         }
     }
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
-       if (gVerboseBruteForce)
-       {
-           vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount);
-       } else
-       {
-          vlog("." );
-       }
-       fflush(stdout);
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
     }
 
 exit:
@@ -731,50 +941,146 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
 
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
-    -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), -1000., -100.,  -4.0, -3.5,
-    -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0,
-
-    +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), +1000., +100.,  +4.0, +3.5,
-    +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0,
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
+    -1000.,
+    -100.,
+    -4.0,
+    -3.5,
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
+    -0.5,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
+    -0.25,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
+
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000.,
+    +100.,
+    +4.0,
+    +3.5,
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
+    +0.5,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
+    +0.25,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
 };
 
-static size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] );
+static size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
 
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
     if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -785,58 +1091,79 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     test_info.f = f;
     test_info.ftz = f->ftz || gForceFTZ;
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+            /* Qualcomm fix: 9461 read-write flags must be compatible with
+             * parent buffer */
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
             /* Qualcomm fix: end */
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
 
@@ -850,402 +1177,455 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         uint64_t *p = (uint64_t *)gIn;
         uint64_t *p2 = (uint64_t *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
-            p[j] = (cl_ulong) genrand_int32(d) | ((cl_ulong) genrand_int32(d) << 32);
-            p2[j] = (cl_ulong) genrand_int32(d) | ((cl_ulong) genrand_int32(d) << 32);
+            p[j] =
+                (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32);
+            p2[j] =
+                (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32);
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    vlog( "\n" );
+    vlog("\n");
 
 exit:
     // Release
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_double );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    dptr        dfunc = job->f->dfunc;
-    int         ftz = job->ftz;
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    const char  *name = job->f->name;
-    cl_long     *t,*r;
-    cl_double   *s,*s2;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    dptr dfunc = job->f->dfunc;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_long *t, *r;
+    cl_double *s, *s2;
 
     Force64BitFPUPrecision();
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_long  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_long *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_long *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     double *p = (double *)gIn + thread_id * buffer_elements;
     double *p2 = (double *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount = specialValuesDoubleCount * specialValuesDoubleCount;
+    int totalSpecialValueCount =
+        specialValuesDoubleCount * specialValuesDoubleCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if( job_id <= (cl_uint)indx )
+    if (job_id <= (cl_uint)indx)
     { // test edge cases
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesDoubleCount;
-    y = (job_id * buffer_elements) / specialValuesDoubleCount;
+        x = (job_id * buffer_elements) % specialValuesDoubleCount;
+        y = (job_id * buffer_elements) / specialValuesDoubleCount;
 
-        for( ; j < buffer_elements; j++ )
+        for (; j < buffer_elements; j++)
         {
             p[j] = specialValuesDouble[x];
             p2[j] = specialValuesDouble[y];
-            if( ++x >= specialValuesDoubleCount )
+            if (++x >= specialValuesDoubleCount)
             {
                 x = 0;
                 y++;
-                if( y >= specialValuesDoubleCount )
-                    break;
+                if (y >= specialValuesDoubleCount) break;
             }
         }
     }
 
-    //Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
     {
-        ((cl_ulong*)p)[j] = genrand_int64(d);
-        ((cl_ulong*)p2)[j] = genrand_int64(d);
+        ((cl_ulong *)p)[j] = genrand_int64(d);
+        ((cl_ulong *)p2)[j] = genrand_int64(d);
     }
 
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
 
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
-    r = (cl_long *)gOut_Ref  + thread_id * buffer_elements;
-    s = (cl_double *)gIn  + thread_id * buffer_elements;
-    s2 = (cl_double *)gIn2  + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = dfunc.i_ff( s[j], s2[j] );
+    // Calculate the correctly rounded reference result
+    r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
+    s = (cl_double *)gIn + thread_id * buffer_elements;
+    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_ff(s[j], s2[j]);
 
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_long *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                           0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    //Verify data
+    // Verify data
     t = (cl_long *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        cl_long *q = (cl_long *) out[0];
+        cl_long *q = (cl_long *)out[0];
 
         // If we aren't getting the correctly rounded result
-        if( gMinVectorSizeIndex == 0 && t[j] != q[j] )
+        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
         {
-            if( ftz )
+            if (ftz)
             {
-                if( IsDoubleSubnormal( s[j])  )
+                if (IsDoubleSubnormal(s[j]))
                 {
-                    if( IsDoubleSubnormal( s2[j] )  )
+                    if (IsDoubleSubnormal(s2[j]))
                     {
-                        int64_t correct = dfunc.i_ff( 0.0f, 0.0f );
-                        int64_t correct2 = dfunc.i_ff( 0.0f, -0.0f );
-                        int64_t correct3 = dfunc.i_ff( -0.0f, 0.0f );
-                        int64_t correct4 = dfunc.i_ff( -0.0f, -0.0f );
+                        int64_t correct = dfunc.i_ff(0.0f, 0.0f);
+                        int64_t correct2 = dfunc.i_ff(0.0f, -0.0f);
+                        int64_t correct3 = dfunc.i_ff(-0.0f, 0.0f);
+                        int64_t correct4 = dfunc.i_ff(-0.0f, -0.0f);
 
-                        if( correct == q[j] || correct2 == q[j] || correct3 == q[j] || correct4 == q[j] )
+                        if (correct == q[j] || correct2 == q[j]
+                            || correct3 == q[j] || correct4 == q[j])
                             continue;
                     }
                     else
                     {
-                        int64_t correct = dfunc.i_ff( 0.0f, s2[j] );
-                        int64_t correct2 = dfunc.i_ff( -0.0f, s2[j] );
-                        if( correct == q[j] || correct2 == q[j]  )
-                            continue;
+                        int64_t correct = dfunc.i_ff(0.0f, s2[j]);
+                        int64_t correct2 = dfunc.i_ff(-0.0f, s2[j]);
+                        if (correct == q[j] || correct2 == q[j]) continue;
                     }
                 }
-                else if( IsDoubleSubnormal( s2[j] ) )
+                else if (IsDoubleSubnormal(s2[j]))
                 {
-                    int64_t correct = dfunc.i_ff( s[j], 0.0f );
-                    int64_t correct2 = dfunc.i_ff( s[j], -0.0f );
-                    if( correct == q[j] || correct2 == q[j]  )
-                        continue;
+                    int64_t correct = dfunc.i_ff(s[j], 0.0f);
+                    int64_t correct2 = dfunc.i_ff(s[j], -0.0f);
+                    if (correct == q[j] || correct2 == q[j]) continue;
                 }
-
             }
 
             uint64_t err = t[j] - q[j];
-            if( q[j] > t[j] )
-                err = q[j] - t[j];
-            vlog_error( "\nERROR: %s: %lld ulp error at {%.13la, %.13la}: *%lld vs. %lld  (index: %d)\n", name, err, ((double*) s)[j], ((double*) s2)[j], t[j], q[j], j );
+            if (q[j] > t[j]) err = q[j] - t[j];
+            vlog_error("\nERROR: %s: %lld ulp error at {%.13la, %.13la}: *%lld "
+                       "vs. %lld  (index: %d)\n",
+                       name, err, ((double *)s)[j], ((double *)s2)[j], t[j],
+                       q[j], j);
             error = -1;
             goto exit;
         }
 
 
-        for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ )
+        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
         {
-            q = (cl_long*) out[k];
+            q = (cl_long *)out[k];
             // If we aren't getting the correctly rounded result
-            if( -t[j] != q[j] )
+            if (-t[j] != q[j])
             {
-                if( ftz )
+                if (ftz)
                 {
-                    if( IsDoubleSubnormal( s[j])  )
+                    if (IsDoubleSubnormal(s[j]))
                     {
-                        if( IsDoubleSubnormal( s2[j] )  )
+                        if (IsDoubleSubnormal(s2[j]))
                         {
-                            int64_t correct = -dfunc.i_ff( 0.0f, 0.0f );
-                            int64_t correct2 = -dfunc.i_ff( 0.0f, -0.0f );
-                            int64_t correct3 = -dfunc.i_ff( -0.0f, 0.0f );
-                            int64_t correct4 = -dfunc.i_ff( -0.0f, -0.0f );
+                            int64_t correct = -dfunc.i_ff(0.0f, 0.0f);
+                            int64_t correct2 = -dfunc.i_ff(0.0f, -0.0f);
+                            int64_t correct3 = -dfunc.i_ff(-0.0f, 0.0f);
+                            int64_t correct4 = -dfunc.i_ff(-0.0f, -0.0f);
 
-                            if( correct == q[j] || correct2 == q[j] || correct3 == q[j] || correct4 == q[j] )
+                            if (correct == q[j] || correct2 == q[j]
+                                || correct3 == q[j] || correct4 == q[j])
                                 continue;
                         }
                         else
                         {
-                            int64_t correct = -dfunc.i_ff( 0.0f, s2[j] );
-                            int64_t correct2 = -dfunc.i_ff( -0.0f, s2[j] );
-                            if( correct == q[j] || correct2 == q[j]  )
-                                continue;
+                            int64_t correct = -dfunc.i_ff(0.0f, s2[j]);
+                            int64_t correct2 = -dfunc.i_ff(-0.0f, s2[j]);
+                            if (correct == q[j] || correct2 == q[j]) continue;
                         }
                     }
-                    else if( IsDoubleSubnormal( s2[j] ) )
+                    else if (IsDoubleSubnormal(s2[j]))
                     {
-                        int64_t correct = -dfunc.i_ff( s[j], 0.0f );
-                        int64_t correct2 = -dfunc.i_ff( s[j], -0.0f );
-                        if( correct == q[j] || correct2 == q[j]  )
-                            continue;
+                        int64_t correct = -dfunc.i_ff(s[j], 0.0f);
+                        int64_t correct2 = -dfunc.i_ff(s[j], -0.0f);
+                        if (correct == q[j] || correct2 == q[j]) continue;
                     }
-
                 }
 
                 uint64_t err = -t[j] - q[j];
-                if( q[j] > -t[j] )
-                    err = q[j] + t[j];
-                vlog_error( "\nERROR: %sD%s: %lld ulp error at {%.13la, %.13la}: *%lld vs. %lld  (index: %d)\n", name, sizeNames[k], err, ((double*) s)[j], ((double*) s2)[j], -t[j], q[j], j );
+                if (q[j] > -t[j]) err = q[j] + t[j];
+                vlog_error("\nERROR: %sD%s: %lld ulp error at {%.13la, "
+                           "%.13la}: *%lld vs. %lld  (index: %d)\n",
+                           name, sizeNames[k], err, ((double *)s)[j],
+                           ((double *)s2)[j], -t[j], q[j], j);
                 error = -1;
                 goto exit;
             }
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
-       if (gVerboseBruteForce)
-       {
-           vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount);
-       } else
-       {
-          vlog("." );
-       }
-       fflush(stdout);
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
     }
 
 exit:
     return error;
 }
-
diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary.cpp
index 70f724ceb6..26a186f640 100644
--- a/test_conformance/math_brute_force/macro_unary.cpp
+++ b/test_conformance/math_brute_force/macro_unary.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -33,60 +33,77 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
-    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       int3 i0 = ", name, "( f0 );\n"
-                            "       vstore3( i0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       int3 i0;\n"
-                            "       float3 f0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], 0xdead, 0xdead ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], 0xdead ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       i0 = ", name, "( f0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = i0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = i0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global int",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global int* out, __global float* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( i0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       int3 i0;\n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], 0xdead, 0xdead ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], 0xdead ); \n"
+        "               break;\n"
+        "       }\n"
+        "       i0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -97,62 +114,79 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                              bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global long", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global long* out, __global double* in)\n"
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global long",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in)\n"
                         "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   if( i + 1 < get_global_size(0) )\n"
-                        "   {\n"
-                        "       double3 d0 = vload3( 0, in + 3 * i );\n"
-                        "       long3 l0 = ", name, "( d0 );\n"
-                        "       vstore3( l0, 0, out + 3*i );\n"
-                        "   }\n"
-                        "   else\n"
-                        "   {\n"
-                        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                        "       double3 d0;\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 1:\n"
-                        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-                        "               break;\n"
-                        "           case 0:\n"
-                        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                        "               break;\n"
-                        "       }\n"
-                        "       long3 l0 = ", name, "( d0 );\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 0:\n"
-                        "               out[3*i+1] = l0.y; \n"
-                        "               // fall through\n"
-                        "           case 1:\n"
-                        "               out[3*i] = l0.x; \n"
-                        "               break;\n"
-                        "       }\n"
-                        "   }\n"
-                        "}\n"
-                    };
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global long* out, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       long3 l0 = ",
+        name,
+        "( d0 );\n"
+        "       vstore3( l0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       long3 l0 = ",
+        name,
+        "( d0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = l0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = l0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -160,80 +194,90 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_uint     kernel_count;
-    cl_kernel   **kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count;
+    cl_kernel **kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
                              info->kernels[i], info->programs + i,
                              info->relaxedMode);
 }
 
-//Thread specific data for a worker thread
+// Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
-    cl_mem      inBuf;                              // input buffer for the thread
-    cl_mem      outBuf[ VECTOR_SIZE_COUNT ];        // output buffers for the thread
-    cl_command_queue tQueue;                        // per thread command queue to improve performance
-}ThreadInfo;
+    cl_mem inBuf; // input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
 
 typedef struct TestInfo
 {
-    size_t      subBufferSize;                      // Size of the sub-buffer in elements
-    const Func  *f;                                 // A pointer to the function info
-    cl_program  programs[ VECTOR_SIZE_COUNT ];      // programs for various vector sizes
-    cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
-    ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
-    cl_uint     threadCount;                        // Number of worker threads
-    cl_uint     jobCount;                           // Number of jobs
-    cl_uint     step;                               // step between each chunk and the next.
-    cl_uint     scale;                              // stride between individual test values
-    int         ftz;                                // non-zero if running in flush to zero mode
-
-}TestInfo;
-
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    int ftz; // non-zero if running in flush to zero mode
+
+} TestInfo;
+
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
-    if (gWimpyMode )
+    if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -242,51 +286,68 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     }
 
     test_info.f = f;
-    test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gOutBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
     }
@@ -297,281 +358,315 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         cl_uint *p = (cl_uint *)gIn;
-        for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             p[j] = genrand_int32(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;    // BUFFER_SIZE / vectorSize  rounded up
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    vlog( "\n" );
+    vlog("\n");
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t  buffer_elements = job->subBufferSize;
-    size_t  buffer_size = buffer_elements * sizeof( cl_float );
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint scale = job->scale;
-    cl_uint base = job_id * (cl_uint) job->step;
+    cl_uint base = job_id * (cl_uint)job->step;
     ThreadInfo *tinfo = job->tinfo + thread_id;
-    fptr    func = job->f->func;
-    int     ftz = job->ftz;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
     cl_uint j, k;
     cl_int error = CL_SUCCESS;
-    cl_int ret   = CL_SUCCESS;
+    cl_int ret = CL_SUCCESS;
     const char *name = job->f->name;
 
     int signbit_test = 0;
-    if(!strcmp(name, "signbit"))
-        signbit_test = 1;
+    if (!strcmp(name, "signbit")) signbit_test = 1;
 
-    #define ref_func(s) ( signbit_test ? func.i_f_f( s ) : func.i_f( s ) )
+#define ref_func(s) (signbit_test ? func.i_f_f(s) : func.i_f(s))
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_int  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_int *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_int *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
     // Write the new values to the input array
-    cl_uint *p = (cl_uint*) gIn + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        p[j] = base + j * scale;
+    cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) p[j] = base + j * scale;
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         return error;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             return error;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             return error;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             return error;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
 
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            vlog_error("FAILED -- could not execute kernel\n");
             return error;
         }
     }
 
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
+    // Calculate the correctly rounded reference result
     cl_int *r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
     float *s = (float *)p;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = ref_func( s[j] );
+    for (j = 0; j < buffer_elements; j++) r[j] = ref_func(s[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_int *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
     // Wait for the last buffer
-    out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                          CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                          0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         return error;
     }
 
-    //Verify data
+    // Verify data
     cl_int *t = (cl_int *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_int *q = out[0];
 
             // If we aren't getting the correctly rounded result
-            if( gMinVectorSizeIndex == 0 && t[j] != q[j])
+            if (gMinVectorSizeIndex == 0 && t[j] != q[j])
             {
                 // If we aren't getting the correctly rounded result
-                if( ftz )
+                if (ftz)
                 {
-                    if( IsFloatSubnormal( s[j]) )
+                    if (IsFloatSubnormal(s[j]))
                     {
-                        int correct = ref_func( +0.0f );
-                        int correct2 = ref_func( -0.0f );
-                        if( correct == q[j] || correct2 == q[j] )
-                            continue;
+                        int correct = ref_func(+0.0f);
+                        int correct2 = ref_func(-0.0f);
+                        if (correct == q[j] || correct2 == q[j]) continue;
                     }
                 }
 
                 uint32_t err = t[j] - q[j];
-                if( q[j] > t[j] )
-                    err = q[j] - t[j];
-                vlog_error( "\nERROR: %s: %d ulp error at %a: *%d vs. %d\n", name,  err, ((float*) s)[j], t[j], q[j] );
+                if (q[j] > t[j]) err = q[j] - t[j];
+                vlog_error("\nERROR: %s: %d ulp error at %a: *%d vs. %d\n",
+                           name, err, ((float *)s)[j], t[j], q[j]);
                 error = -1;
                 goto exit;
             }
 
 
-            for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ )
+            for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
             {
                 q = out[k];
                 // If we aren't getting the correctly rounded result
-                if( -t[j] != q[j] )
+                if (-t[j] != q[j])
                 {
-                    if( ftz )
+                    if (ftz)
                     {
-                        if( IsFloatSubnormal( s[j]))
+                        if (IsFloatSubnormal(s[j]))
                         {
-                            int correct = -ref_func( +0.0f );
-                            int correct2 = -ref_func( -0.0f );
-                            if( correct == q[j] || correct2 == q[j] )
-                                continue;
+                            int correct = -ref_func(+0.0f);
+                            int correct2 = -ref_func(-0.0f);
+                            if (correct == q[j] || correct2 == q[j]) continue;
                         }
                     }
 
                     uint32_t err = -t[j] - q[j];
-                    if( q[j] > -t[j] )
-                        err = q[j] + t[j];
-                    vlog_error( "\nERROR: %s%s: %d ulp error at %a: *%d vs. %d\n", name, sizeNames[k], err, ((float*) s)[j], -t[j], q[j] );
-                  error = -1;
-                  goto exit;
+                    if (q[j] > -t[j]) err = q[j] + t[j];
+                    vlog_error(
+                        "\nERROR: %s%s: %d ulp error at %a: *%d vs. %d\n", name,
+                        sizeNames[k], err, ((float *)s)[j], -t[j], q[j]);
+                    error = -1;
+                    goto exit;
                 }
             }
         }
@@ -579,60 +674,69 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
 
 exit:
     ret = error;
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
+    if ((error = clFlush(tinfo->tQueue)))
     {
-        vlog( "clFlush 3 failed\n" );
+        vlog("clFlush 3 failed\n");
         return error;
     }
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
-       if (gVerboseBruteForce)
-       {
-           vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount);
-       } else
-       {
-          vlog("." );
-       }
-       fflush(stdout);
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
     }
 
     return ret;
 }
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data );
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
-    if (gWimpyMode )
+    if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -643,52 +747,69 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     test_info.f = f;
     test_info.ftz = f->ftz || gForceFTZ;
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+            /* Qualcomm fix: 9461 read-write flags must be compatible with
+             * parent buffer */
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
             /* Qualcomm fix: end */
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
     }
@@ -699,117 +820,131 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         cl_ulong *p = (cl_ulong *)gIn;
-        for( j = 0; j < BUFFER_SIZE / sizeof( cl_double ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
             p[j] = DoubleFromUInt32(genrand_int32(d));
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    vlog( "\n" );
+    vlog("\n");
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t  buffer_elements = job->subBufferSize;
-    size_t  buffer_size = buffer_elements * sizeof( cl_double );
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint scale = job->scale;
-    cl_uint base = job_id * (cl_uint) job->step;
+    cl_uint base = job_id * (cl_uint)job->step; // seed of the job's 1st input
     ThreadInfo *tinfo = job->tinfo + thread_id;
-    dptr    dfunc = job->f->dfunc;
+    dptr dfunc = job->f->dfunc;
     cl_uint j, k;
     cl_int error;
     int ftz = job->ftz;
@@ -818,189 +953,219 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
     Force64BitFPUPrecision();
 
+    // Job body run by the thread pool: generates this job's inputs into the
+    // thread's slice of gIn, runs the kernel for every vector size index in
+    // [gMinVectorSizeIndex, gMaxVectorSizeIndex) on tinfo->tQueue, and checks
+    // the results against dfunc.i_f(). Returns CL_SUCCESS, the failing CL
+    // error code, or -1 when a result does not match the reference.
+
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_long *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_long *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_long *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
     // Write the new values to the input array
-    cl_double *p = (cl_double*) gIn + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        p[j] = DoubleFromUInt32( base + j * scale);
+    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        p[j] = DoubleFromUInt32(base + j * scale);
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         return error;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             return error;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             return error;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             return error;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
 
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            vlog_error("FAILED -- could not execute kernel\n");
             return error;
         }
     }
 
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
+    // Calculate the correctly rounded reference result
     cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
     cl_double *s = (cl_double *)p;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = dfunc.i_f( s[j] );
+    for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_long *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
     // Wait for the last buffer
-    out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                           0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         return error;
     }
 
 
-    //Verify data
+    // Verify data
     cl_long *t = (cl_long *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
         cl_long *q = out[0];
 
 
         // If we aren't getting the correctly rounded result
-        if( gMinVectorSizeIndex == 0 && t[j] != q[j])
+        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
         {
             // If we aren't getting the correctly rounded result
-            if( ftz )
+            if (ftz)
             {
-                if( IsDoubleSubnormal( s[j]) )
+                if (IsDoubleSubnormal(s[j]))
                 {
-                    cl_long correct = dfunc.i_f( +0.0f );
-                    cl_long correct2 = dfunc.i_f( -0.0f );
-                    if( correct == q[j] || correct2 == q[j] )
-                        continue;
+                    cl_long correct = dfunc.i_f(+0.0f);
+                    cl_long correct2 = dfunc.i_f(-0.0f);
+                    if (correct == q[j] || correct2 == q[j]) continue;
                 }
             }
 
             cl_ulong err = t[j] - q[j];
-            if( q[j] > t[j] )
-                err = q[j] - t[j];
-            vlog_error( "\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n", name,  err, ((double*) gIn)[j], t[j], q[j] );
+            if (q[j] > t[j]) err = q[j] - t[j];
+            vlog_error("\nERROR: %sD: %llu ulp error at %.13la: *%lld vs. "
+                       "%lld\n",
+                       name, (unsigned long long)err, s[j], (long long)t[j],
+                       (long long)q[j]);
             return -1;
         }
 
 
-        for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ )
+        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
         {
             q = out[k];
             // If we aren't getting the correctly rounded result
-            if( -t[j] != q[j] )
+            if (-t[j] != q[j])
             {
-                if( ftz )
+                if (ftz)
                 {
-                    if( IsDoubleSubnormal( s[j]))
+                    if (IsDoubleSubnormal(s[j]))
                     {
-                        int64_t correct = -dfunc.i_f( +0.0f );
-                        int64_t correct2 = -dfunc.i_f( -0.0f );
-                        if( correct == q[j] || correct2 == q[j] )
-                            continue;
+                        int64_t correct = -dfunc.i_f(+0.0f);
+                        int64_t correct2 = -dfunc.i_f(-0.0f);
+                        if (correct == q[j] || correct2 == q[j]) continue;
                     }
                 }
 
                 cl_ulong err = -t[j] - q[j];
-                if( q[j] > -t[j] )
-                    err = q[j] + t[j];
-                vlog_error( "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n", name, sizeNames[k], err, ((double*) gIn)[j], -t[j], q[j] );
+                if (q[j] > -t[j]) err = q[j] + t[j];
+                vlog_error("\nERROR: %sD%s: %llu ulp error at %.13la: *%lld "
+                           "vs. %lld\n",
+                           name, sizeNames[k], (unsigned long long)err, s[j],
+                           (long long)-t[j], (long long)q[j]);
                 return -1;
             }
         }
-
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
-       if (gVerboseBruteForce)
-       {
-           vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount);
-       } else
-       {
-          vlog("." );
-       }
-       fflush(stdout);
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
     }
 
     return CL_SUCCESS;
 }
-
-
-
-
diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad.cpp
index ed1d7d53fb..9292649aa3 100644
--- a/test_conformance/math_brute_force/mad.cpp
+++ b/test_conformance/math_brute_force/mad.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -31,66 +31,87 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2,  __global float", sizeNames[vectorSize], "* in3 )\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i], in3[i] );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global float* in2, __global float* in3)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-                            "       float3 f2 = vload3( 0, in3 + 3 * i );\n"
-                            "       f0 = ", name, "( f0, f1, f2 );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 f0, f1, f2;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-                            "               f2 = (float3)( in3[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( f0, f1, f2 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel", // default kernel source
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2,  __global float",
+                        sizeNames[vectorSize],
+                        "* in3 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], in3[i] );\n"
+                        "}\n" };
+    const char *c3[] = { // width-3 source: vload3/vstore3 plus a scalar tail
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in, __global float* in2, "
+        "__global float* in3)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       float3 f2 = vload3( 0, in3 + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, f2 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0, f1, f2;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
+        "               f2 = (float3)( in3[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, f2 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]); // number of source fragments
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3) // width 3 uses the c3 source instead
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s", // kernel name
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
@@ -98,94 +119,119 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-                            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2,  __global double", sizeNames[vectorSize], "* in3 )\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i], in3[i] );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in, __global double* in2, __global double* in3)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       double3 d0 = vload3( 0, in + 3 * i );\n"
-                            "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-                            "       double3 d2 = vload3( 0, in3 + 3 * i );\n"
-                            "       d0 = ", name, "( d0, d1, d2 );\n"
-                            "       vstore3( d0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       double3 d0, d1, d2;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-                            "               d2 = (double3)( in3[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       d0 = ", name, "( d0, d1, d2 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = d0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = d0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2,  __global double",
+                        sizeNames[vectorSize],
+                        "* in3 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], in3[i] );\n"
+                        "}\n" };
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2, "
+        "__global double* in3)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       double3 d2 = vload3( 0, in3 + 3 * i );\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, d2 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0, d1, d2;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               d2 = (double3)( in3[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, d2 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_kernel   *kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernels + i,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
                              info->programs + i, info->relaxedMode);
@@ -199,455 +245,503 @@ int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode)
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
-//    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    //    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM &
+    //    gFloatCapabilities);
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
     float maxErrorVal3 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
 
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
         return error;
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
+       programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
         uint32_t *p3 = (uint32_t *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
             p3[j] = genrand_int32(d);
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         float *r = (float *)gOut_Ref;
         float *s = (float *)gIn;
         float *s2 = (float *)gIn2;
         float *s3 = (float *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
-            r[j] = (float) f->func.f_fff( s[j], s2[j], s3[j] );
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            r[j] = (float)f->func.f_fff(s[j], s2[j], s3[j]);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data  -- Commented out on purpose. no verification possible. MAD is a random number generator.
-/*
-        uint32_t *t = gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
-        {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
-            {
-                uint32_t *q = gOut[k];
-
-                // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+        // Verify data  -- Commented out on purpose. no verification possible.
+        // MAD is a random number generator.
+        /*
+                uint32_t *t = gOut_Ref;
+                for( j = 0; j < bufferSize / sizeof( float ); j++ )
                 {
-                    float test = ((float*) q)[j];
-                    double correct = f->func.f_fff( s[j], s2[j], s3[j] );
-                    float err = Ulp_Error( test, correct );
-                    int fail = ! (fabsf(err) <= f->float_ulps);
-
-                    if( fail && ftz )
+                    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
                     {
-                        // retry per section 6.5.3.2
-                        if( IsFloatSubnormal(correct) )
-                        { // look at me,
-                            fail = fail && ( test != 0.0f );
-                            if( ! fail )
-                                err = 0.0f;
-                        }
+                        uint32_t *q = gOut[k];
 
-                        // retry per section 6.5.3.3
-                        if( fail && IsFloatSubnormal( s[j] ) )
-                        { // look at me,
-                            double correct2 = f->func.f_fff( 0.0, s2[j], s3[j] );
-                            double correct3 = f->func.f_fff( -0.0, s2[j], s3[j] );
-                            float err2 = Ulp_Error( test, correct2  );
-                            float err3 = Ulp_Error( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-
-                            // retry per section 6.5.3.4
-                            if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) )
-                            { // look at me now,
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
-                            }
+                        // If we aren't getting the correctly rounded result
+                        if( t[j] != q[j] )
+                        {
+                            float test = ((float*) q)[j];
+                            double correct = f->func.f_fff( s[j], s2[j], s3[j]
+         ); float err = Ulp_Error( test, correct ); int fail = ! (fabsf(err) <=
+         f->float_ulps);
 
-                            //try with first two args as zero
-                            if( IsFloatSubnormal( s2[j] ) )
-                            { // its fun to have fun,
-                                correct2 = f->func.f_fff( 0.0, 0.0, s3[j] );
-                                correct3 = f->func.f_fff( -0.0, 0.0, s3[j] );
-                                double correct4 = f->func.f_fff( 0.0, -0.0, s3[j] );
-                                double correct5 = f->func.f_fff( -0.0, -0.0, s3[j] );
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                float err4 = Ulp_Error( test, correct4  );
-                                float err5 = Ulp_Error( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)) &&
-                                                 (!(fabsf(err4) <= f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
-
-                                // retry per section 6.5.3.4
-                                if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ||
-                                    IsFloatResultSubnormal(correct4, f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
-                                {
-                                    fail = fail && ( test != 0.0f);
+                            if( fail && ftz )
+                            {
+                                // retry per section 6.5.3.2
+                                if( IsFloatSubnormal(correct) )
+                                { // look at me,
+                                    fail = fail && ( test != 0.0f );
                                     if( ! fail )
                                         err = 0.0f;
                                 }
 
-                                if( IsFloatSubnormal( s3[j] )  )
-                                { // but you have to know how!
-                                    correct2 = f->func.f_fff( 0.0, 0.0, 0.0f );
-                                    correct3 = f->func.f_fff( -0.0, 0.0, 0.0f );
-                                    correct4 = f->func.f_fff( 0.0, -0.0, 0.0f );
-                                    correct5 = f->func.f_fff( -0.0, -0.0, 0.0f );
-                                    double correct6 = f->func.f_fff( 0.0, 0.0, -0.0f );
-                                    double correct7 = f->func.f_fff( -0.0, 0.0, -0.0f );
-                                    double correct8 = f->func.f_fff( 0.0, -0.0, -0.0f );
-                                    double correct9 = f->func.f_fff( -0.0, -0.0, -0.0f );
-                                    err2 = Ulp_Error( test, correct2  );
-                                    err3 = Ulp_Error( test, correct3  );
-                                    err4 = Ulp_Error( test, correct4  );
-                                    err5 = Ulp_Error( test, correct5  );
-                                    float err6 = Ulp_Error( test, correct6  );
-                                    float err7 = Ulp_Error( test, correct7  );
-                                    float err8 = Ulp_Error( test, correct8  );
-                                    float err9 = Ulp_Error( test, correct9  );
-                                    fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)) &&
-                                                     (!(fabsf(err4) <= f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps)) &&
-                                                     (!(fabsf(err5) <= f->float_ulps)) && (!(fabsf(err6) <= f->float_ulps)) &&
-                                                     (!(fabsf(err7) <= f->float_ulps)) && (!(fabsf(err8) <= f->float_ulps)));
-                                    if( fabsf( err2 ) < fabsf(err ) )
+                                // retry per section 6.5.3.3
+                                if( fail && IsFloatSubnormal( s[j] ) )
+                                { // look at me,
+                                    double correct2 = f->func.f_fff( 0.0, s2[j],
+         s3[j] ); double correct3 = f->func.f_fff( -0.0, s2[j], s3[j] ); float
+         err2 = Ulp_Error( test, correct2  ); float err3 = Ulp_Error( test,
+         correct3  ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) &&
+         (!(fabsf(err3) <= f->float_ulps))); if( fabsf( err2 ) < fabsf(err ) )
                                         err = err2;
                                     if( fabsf( err3 ) < fabsf(err ) )
                                         err = err3;
-                                    if( fabsf( err4 ) < fabsf(err ) )
-                                        err = err4;
-                                    if( fabsf( err5 ) < fabsf(err ) )
-                                        err = err5;
-                                    if( fabsf( err6 ) < fabsf(err ) )
-                                        err = err6;
-                                    if( fabsf( err7 ) < fabsf(err ) )
-                                        err = err7;
-                                    if( fabsf( err8 ) < fabsf(err ) )
-                                        err = err8;
-                                    if( fabsf( err9 ) < fabsf(err ) )
-                                        err = err9;
 
                                     // retry per section 6.5.3.4
-                                    if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps )  ||
-                                        IsFloatResultSubnormal(correct4, f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps )  ||
-                                        IsFloatResultSubnormal( correct6, f->float_ulps ) || IsFloatResultSubnormal(correct7, f->float_ulps )  ||
-                                        IsFloatResultSubnormal(correct8, f->float_ulps ) || IsFloatResultSubnormal( correct9, f->float_ulps )  )
+                                    if( IsFloatResultSubnormal(correct2,
+         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) )
+                                    { // look at me now,
+                                        fail = fail && ( test != 0.0f);
+                                        if( ! fail )
+                                            err = 0.0f;
+                                    }
+
+                                    //try with first two args as zero
+                                    if( IsFloatSubnormal( s2[j] ) )
+                                    { // its fun to have fun,
+                                        correct2 = f->func.f_fff( 0.0, 0.0,
+         s3[j] ); correct3 = f->func.f_fff( -0.0, 0.0, s3[j] ); double correct4
+         = f->func.f_fff( 0.0, -0.0, s3[j] ); double correct5 = f->func.f_fff(
+         -0.0, -0.0, s3[j] ); err2 = Ulp_Error( test, correct2  ); err3 =
+         Ulp_Error( test, correct3  ); float err4 = Ulp_Error( test, correct4 );
+                                        float err5 = Ulp_Error( test, correct5
+         ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3)
+         <= f->float_ulps)) &&
+                                                         (!(fabsf(err4) <=
+         f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); if( fabsf( err2
+         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
+         err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) <
+         fabsf(err ) ) err = err5;
+
+                                        // retry per section 6.5.3.4
+                                        if( IsFloatResultSubnormal(correct2,
+         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ||
+                                            IsFloatResultSubnormal(correct4,
+         f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
+                                        {
+                                            fail = fail && ( test != 0.0f);
+                                            if( ! fail )
+                                                err = 0.0f;
+                                        }
+
+                                        if( IsFloatSubnormal( s3[j] )  )
+                                        { // but you have to know how!
+                                            correct2 = f->func.f_fff( 0.0, 0.0,
+         0.0f ); correct3 = f->func.f_fff( -0.0, 0.0, 0.0f ); correct4 =
+         f->func.f_fff( 0.0, -0.0, 0.0f ); correct5 = f->func.f_fff( -0.0, -0.0,
+         0.0f ); double correct6 = f->func.f_fff( 0.0, 0.0, -0.0f ); double
+         correct7 = f->func.f_fff( -0.0, 0.0, -0.0f ); double correct8 =
+         f->func.f_fff( 0.0, -0.0, -0.0f ); double correct9 = f->func.f_fff(
+         -0.0, -0.0, -0.0f ); err2 = Ulp_Error( test, correct2  ); err3 =
+         Ulp_Error( test, correct3  ); err4 = Ulp_Error( test, correct4  ); err5
+         = Ulp_Error( test, correct5  ); float err6 = Ulp_Error( test, correct6
+         ); float err7 = Ulp_Error( test, correct7  ); float err8 = Ulp_Error(
+         test, correct8  ); float err9 = Ulp_Error( test, correct9  ); fail =
+         fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <=
+         f->float_ulps)) &&
+                                                             (!(fabsf(err4) <=
+         f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps)) &&
+                                                             (!(fabsf(err5) <=
+         f->float_ulps)) && (!(fabsf(err6) <= f->float_ulps)) &&
+                                                             (!(fabsf(err7) <=
+         f->float_ulps)) && (!(fabsf(err8) <= f->float_ulps))); if( fabsf( err2
+         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
+         err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) <
+         fabsf(err ) ) err = err5; if( fabsf( err6 ) < fabsf(err ) ) err = err6;
+                                            if( fabsf( err7 ) < fabsf(err ) )
+                                                err = err7;
+                                            if( fabsf( err8 ) < fabsf(err ) )
+                                                err = err8;
+                                            if( fabsf( err9 ) < fabsf(err ) )
+                                                err = err9;
+
+                                            // retry per section 6.5.3.4
+                                            if( IsFloatResultSubnormal(correct2,
+         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps )  ||
+                                                IsFloatResultSubnormal(correct4,
+         f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps )  ||
+                                                IsFloatResultSubnormal(
+         correct6, f->float_ulps ) || IsFloatResultSubnormal(correct7,
+         f->float_ulps )  || IsFloatResultSubnormal(correct8, f->float_ulps ) ||
+         IsFloatResultSubnormal( correct9, f->float_ulps )  )
+                                            {
+                                                fail = fail && ( test != 0.0f);
+                                                if( ! fail )
+                                                    err = 0.0f;
+                                            }
+                                        }
+                                    }
+                                    else if( IsFloatSubnormal( s3[j] ) )
+                                    {
+                                        correct2 = f->func.f_fff( 0.0, s2[j],
+         0.0 ); correct3 = f->func.f_fff( -0.0, s2[j], 0.0 ); double correct4 =
+         f->func.f_fff( 0.0,  s2[j], -0.0 ); double correct5 = f->func.f_fff(
+         -0.0, s2[j], -0.0 ); err2 = Ulp_Error( test, correct2  ); err3 =
+         Ulp_Error( test, correct3  ); float err4 = Ulp_Error( test, correct4 );
+                                        float err5 = Ulp_Error( test, correct5
+         ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3)
+         <= f->float_ulps)) &&
+                                                         (!(fabsf(err4) <=
+         f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); if( fabsf( err2
+         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
+         err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) <
+         fabsf(err ) ) err = err5;
+
+                                        // retry per section 6.5.3.4
+                                        if( IsFloatResultSubnormal(correct2,
+         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps )  ||
+                                            IsFloatResultSubnormal(correct4,
+         f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
+                                        {
+                                            fail = fail && ( test != 0.0f);
+                                            if( ! fail )
+                                                err = 0.0f;
+                                        }
+                                    }
+                                }
+                                else if( fail && IsFloatSubnormal( s2[j] ) )
+                                {
+                                    double correct2 = f->func.f_fff( s[j], 0.0,
+         s3[j] ); double correct3 = f->func.f_fff( s[j], -0.0, s3[j] ); float
+         err2 = Ulp_Error( test, correct2  ); float err3 = Ulp_Error( test,
+         correct3  ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) &&
+         (!(fabsf(err3) <= f->float_ulps))); if( fabsf( err2 ) < fabsf(err ) )
+                                        err = err2;
+                                    if( fabsf( err3 ) < fabsf(err ) )
+                                        err = err3;
+
+                                    // retry per section 6.5.3.4
+                                    if( IsFloatResultSubnormal(correct2,
+         f->float_ulps )  || IsFloatResultSubnormal(correct3, f->float_ulps ) )
                                     {
                                         fail = fail && ( test != 0.0f);
                                         if( ! fail )
                                             err = 0.0f;
                                     }
+
+                                    //try with second two args as zero
+                                    if( IsFloatSubnormal( s3[j] ) )
+                                    {
+                                        correct2 = f->func.f_fff( s[j], 0.0, 0.0
+         ); correct3 = f->func.f_fff( s[j], -0.0, 0.0 ); double correct4 =
+         f->func.f_fff( s[j], 0.0, -0.0 ); double correct5 = f->func.f_fff(
+         s[j], -0.0, -0.0 ); err2 = Ulp_Error( test, correct2  ); err3 =
+         Ulp_Error( test, correct3  ); float err4 = Ulp_Error( test, correct4 );
+                                        float err5 = Ulp_Error( test, correct5
+         ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3)
+         <= f->float_ulps)) &&
+                                                         (!(fabsf(err4) <=
+         f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); if( fabsf( err2
+         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
+         err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) <
+         fabsf(err ) ) err = err5;
+
+                                        // retry per section 6.5.3.4
+                                        if( IsFloatResultSubnormal(correct2,
+         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ||
+                                            IsFloatResultSubnormal(correct4,
+         f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
+                                        {
+                                            fail = fail && ( test != 0.0f);
+                                            if( ! fail )
+                                                err = 0.0f;
+                                        }
+                                    }
                                 }
-                            }
-                            else if( IsFloatSubnormal( s3[j] ) )
-                            {
-                                correct2 = f->func.f_fff( 0.0, s2[j], 0.0 );
-                                correct3 = f->func.f_fff( -0.0, s2[j], 0.0 );
-                                double correct4 = f->func.f_fff( 0.0,  s2[j], -0.0 );
-                                double correct5 = f->func.f_fff( -0.0, s2[j], -0.0 );
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                float err4 = Ulp_Error( test, correct4  );
-                                float err5 = Ulp_Error( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)) &&
-                                                 (!(fabsf(err4) <= f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
-
-                                // retry per section 6.5.3.4
-                                if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps )  ||
-                                    IsFloatResultSubnormal(correct4, f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
+                                else if( fail && IsFloatSubnormal(s3[j]) )
                                 {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    double correct2 = f->func.f_fff( s[j],
+         s2[j], 0.0 ); double correct3 = f->func.f_fff( s[j], s2[j], -0.0 );
+                                    float err2 = Ulp_Error( test, correct2  );
+                                    float err3 = Ulp_Error( test, correct3  );
+                                    fail =  fail && ((!(fabsf(err2) <=
+         f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps))); if( fabsf( err2
+         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
+         err3;
+
+                                    // retry per section 6.5.3.4
+                                    if( IsFloatResultSubnormal(correct2,
+         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) )
+                                    {
+                                        fail = fail && ( test != 0.0f);
+                                        if( ! fail )
+                                            err = 0.0f;
+                                    }
                                 }
                             }
-                        }
-                        else if( fail && IsFloatSubnormal( s2[j] ) )
-                        {
-                            double correct2 = f->func.f_fff( s[j], 0.0, s3[j] );
-                            double correct3 = f->func.f_fff( s[j], -0.0, s3[j] );
-                            float err2 = Ulp_Error( test, correct2  );
-                            float err3 = Ulp_Error( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-
-                            // retry per section 6.5.3.4
-                            if( IsFloatResultSubnormal(correct2, f->float_ulps )  || IsFloatResultSubnormal(correct3, f->float_ulps ) )
-                            {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
-                            }
 
-                            //try with second two args as zero
-                            if( IsFloatSubnormal( s3[j] ) )
+                            if( fabsf(err ) > maxError )
                             {
-                                correct2 = f->func.f_fff( s[j], 0.0, 0.0 );
-                                correct3 = f->func.f_fff( s[j], -0.0, 0.0 );
-                                double correct4 = f->func.f_fff( s[j], 0.0, -0.0 );
-                                double correct5 = f->func.f_fff( s[j], -0.0, -0.0 );
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                float err4 = Ulp_Error( test, correct4  );
-                                float err5 = Ulp_Error( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)) &&
-                                                 (!(fabsf(err4) <= f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
-
-                                // retry per section 6.5.3.4
-                                if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ||
-                                    IsFloatResultSubnormal(correct4, f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
-                                {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
-                                }
+                                maxError = fabsf(err);
+                                maxErrorVal = s[j];
+                                maxErrorVal2 = s2[j];
+                                maxErrorVal3 = s3[j];
                             }
-                        }
-                        else if( fail && IsFloatSubnormal(s3[j]) )
-                        {
-                            double correct2 = f->func.f_fff( s[j], s2[j], 0.0 );
-                            double correct3 = f->func.f_fff( s[j], s2[j], -0.0 );
-                            float err2 = Ulp_Error( test, correct2  );
-                            float err3 = Ulp_Error( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-
-                            // retry per section 6.5.3.4
-                            if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) )
+
+                            if( fail )
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                vlog_error( "\nERROR: %s%s: %f ulp error at {%a,
+         %a, %a}: *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j],
+         ((float*) gOut_Ref)[j], test ); error = -1; goto exit;
                             }
                         }
                     }
-
-                    if( fabsf(err ) > maxError )
-                    {
-                        maxError = fabsf(err);
-                        maxErrorVal = s[j];
-                        maxErrorVal2 = s2[j];
-                        maxErrorVal3 = s3[j];
-                    }
-
-                    if( fail )
-                    {
-                        vlog_error( "\nERROR: %s%s: %f ulp error at {%a, %a, %a}: *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j], ((float*) gOut_Ref)[j], test );
- error = -1;
- goto exit;
-                    }
                 }
-            }
-        }
-*/
-        if( 0 == (i & 0x0fffffff) )
+        */
+        if (0 == (i & 0x0fffffff))
         {
-            vlog("." );
+            vlog(".");
             fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "pass" );
+            vlog("pass");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
         uint32_t *p3 = (uint32_t *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
             p3[j] = genrand_int32(d);
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, maxErrorVal3 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -661,14 +755,14 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
-//    int ftz = f->ftz || gForceFTZ;
+    //    int ftz = f->ftz || gForceFTZ;
     double maxErrorVal = 0.0f;
     double maxErrorVal2 = 0.0f;
     double maxErrorVal3 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
     uint64_t step = getTestStep(sizeof(double), bufferSize);
@@ -676,448 +770,511 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
-                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
     {
         return error;
     }
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
+       i, programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
         double *p3 = (double *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
             p3[j] = DoubleFromUInt32(genrand_int32(d));
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         double *r = (double *)gOut_Ref;
         double *s = (double *)gIn;
         double *s2 = (double *)gIn2;
         double *s3 = (double *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-            r[j] = (double) f->dfunc.f_fff( s[j], s2[j], s3[j] );
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+            r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
-
-        //Verify data  -- Commented out on purpose. no verification possible. MAD is a random number generator.
-/*
-        uint64_t *t = gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-        {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
-            {
-                uint64_t *q = gOut[k];
+        if (gSkipCorrectnessTesting) break;
 
-                // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+        // Verify data  -- Commented out on purpose. no verification possible.
+        // MAD is a random number generator.
+        /*
+                uint64_t *t = gOut_Ref;
+                for( j = 0; j < bufferSize / sizeof( double ); j++ )
                 {
-                    double test = ((double*) q)[j];
-                    long double correct = f->dfunc.f_fff( s[j], s2[j], s3[j] );
-                    float err = Bruteforce_Ulp_Error_Double( test, correct );
-                    int fail = ! (fabsf(err) <= f->double_ulps);
-
-                    if( fail && ftz )
+                    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
                     {
-                        // retry per section 6.5.3.2
-                        if( IsDoubleResultSubnormal(correct, f->double_ulps) )
-                        { // look at me,
-                            fail = fail && ( test != 0.0f );
-                            if( ! fail )
-                                err = 0.0f;
-                        }
+                        uint64_t *q = gOut[k];
 
-                        // retry per section 6.5.3.3
-                        if( fail && IsDoubleSubnormal( s[j] ) )
-                        { // look at me,
-                            long double correct2 = f->dfunc.f_fff( 0.0, s2[j], s3[j] );
-                            long double correct3 = f->dfunc.f_fff( -0.0, s2[j], s3[j] );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-
-                            // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
-                            { // look at me now,
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
-                            }
+                        // If we aren't getting the correctly rounded result
+                        if( t[j] != q[j] )
+                        {
+                            double test = ((double*) q)[j];
+                            long double correct = f->dfunc.f_fff( s[j], s2[j],
+         s3[j] ); float err = Bruteforce_Ulp_Error_Double( test, correct ); int
+         fail = ! (fabsf(err) <= f->double_ulps);
 
-                            //try with first two args as zero
-                            if( IsDoubleSubnormal( s2[j] ) )
-                            { // its fun to have fun,
-                                correct2 = f->dfunc.f_fff( 0.0, 0.0, s3[j] );
-                                correct3 = f->dfunc.f_fff( -0.0, 0.0, s3[j] );
-                                long double correct4 = f->dfunc.f_fff( 0.0, -0.0, s3[j] );
-                                long double correct5 = f->dfunc.f_fff( -0.0, -0.0, s3[j] );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                 (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
-
-                                // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ||
-                                    IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) )
-                                {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                            if( fail && ftz )
+                            {
+                                // retry per section 6.5.3.2
+                                if( IsDoubleResultSubnormal(correct,
+         f->double_ulps) ) { // look at me, fail = fail && ( test != 0.0f ); if(
+         ! fail ) err = 0.0f;
                                 }
 
-                                if( IsDoubleSubnormal( s3[j] )  )
-                                { // but you have to know how!
-                                    correct2 = f->dfunc.f_fff( 0.0, 0.0, 0.0f );
-                                    correct3 = f->dfunc.f_fff( -0.0, 0.0, 0.0f );
-                                    correct4 = f->dfunc.f_fff( 0.0, -0.0, 0.0f );
-                                    correct5 = f->dfunc.f_fff( -0.0, -0.0, 0.0f );
-                                    long double correct6 = f->dfunc.f_fff( 0.0, 0.0, -0.0f );
-                                    long double correct7 = f->dfunc.f_fff( -0.0, 0.0, -0.0f );
-                                    long double correct8 = f->dfunc.f_fff( 0.0, -0.0, -0.0f );
-                                    long double correct9 = f->dfunc.f_fff( -0.0, -0.0, -0.0f );
-                                    err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                    err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                    err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                    err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                    float err6 = Bruteforce_Ulp_Error_Double( test, correct6  );
-                                    float err7 = Bruteforce_Ulp_Error_Double( test, correct7  );
-                                    float err8 = Bruteforce_Ulp_Error_Double( test, correct8  );
-                                    float err9 = Bruteforce_Ulp_Error_Double( test, correct9  );
-                                    fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                     (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)) &&
-                                                     (!(fabsf(err5) <= f->double_ulps)) && (!(fabsf(err6) <= f->double_ulps)) &&
-                                                     (!(fabsf(err7) <= f->double_ulps)) && (!(fabsf(err8) <= f->double_ulps)));
-                                    if( fabsf( err2 ) < fabsf(err ) )
-                                        err = err2;
-                                    if( fabsf( err3 ) < fabsf(err ) )
-                                        err = err3;
-                                    if( fabsf( err4 ) < fabsf(err ) )
-                                        err = err4;
-                                    if( fabsf( err5 ) < fabsf(err ) )
-                                        err = err5;
-                                    if( fabsf( err6 ) < fabsf(err ) )
-                                        err = err6;
-                                    if( fabsf( err7 ) < fabsf(err ) )
-                                        err = err7;
-                                    if( fabsf( err8 ) < fabsf(err ) )
-                                        err = err8;
-                                    if( fabsf( err9 ) < fabsf(err ) )
-                                        err = err9;
+                                // retry per section 6.5.3.3
+                                if( fail && IsDoubleSubnormal( s[j] ) )
+                                { // look at me,
+                                    long double correct2 = f->dfunc.f_fff( 0.0,
+         s2[j], s3[j] ); long double correct3 = f->dfunc.f_fff( -0.0, s2[j],
+         s3[j] ); float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
+                                    float err3 = Bruteforce_Ulp_Error_Double(
+         test, correct3  ); fail =  fail && ((!(fabsf(err2) <= f->double_ulps))
+         && (!(fabsf(err3) <= f->double_ulps))); if( fabsf( err2 ) < fabsf(err )
+         ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err = err3;
 
                                     // retry per section 6.5.3.4
-                                    if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )  ||
-                                        IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps )  ||
-                                        IsDoubleResultSubnormal( correct6, f->double_ulps ) || IsDoubleResultSubnormal( correct7, f->double_ulps )  ||
-                                        IsDoubleResultSubnormal( correct8, f->double_ulps ) || IsDoubleResultSubnormal( correct9, f->double_ulps )  )
+                                    if( IsDoubleResultSubnormal( correct2,
+         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
+         ) { // look at me now, fail = fail && ( test != 0.0f); if( ! fail ) err
+         = 0.0f;
+                                    }
+
+                                    //try with first two args as zero
+                                    if( IsDoubleSubnormal( s2[j] ) )
+                                    { // its fun to have fun,
+                                        correct2 = f->dfunc.f_fff( 0.0, 0.0,
+         s3[j] ); correct3 = f->dfunc.f_fff( -0.0, 0.0, s3[j] ); long double
+         correct4 = f->dfunc.f_fff( 0.0, -0.0, s3[j] ); long double correct5 =
+         f->dfunc.f_fff( -0.0, -0.0, s3[j] ); err2 =
+         Bruteforce_Ulp_Error_Double( test, correct2  ); err3 =
+         Bruteforce_Ulp_Error_Double( test, correct3  ); float err4 =
+         Bruteforce_Ulp_Error_Double( test, correct4  ); float err5 =
+         Bruteforce_Ulp_Error_Double( test, correct5  ); fail =  fail &&
+         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
+         f->double_ulps)) &&
+                                                         (!(fabsf(err4) <=
+         f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); if( fabsf(
+         err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) )
+                                            err = err3;
+                                        if( fabsf( err4 ) < fabsf(err ) )
+                                            err = err4;
+                                        if( fabsf( err5 ) < fabsf(err ) )
+                                            err = err5;
+
+                                        // retry per section 6.5.3.4
+                                        if( IsDoubleResultSubnormal( correct2,
+         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
+         || IsDoubleResultSubnormal( correct4, f->double_ulps ) ||
+         IsDoubleResultSubnormal( correct5, f->double_ulps ) )
+                                        {
+                                            fail = fail && ( test != 0.0f);
+                                            if( ! fail )
+                                                err = 0.0f;
+                                        }
+
+                                        if( IsDoubleSubnormal( s3[j] )  )
+                                        { // but you have to know how!
+                                            correct2 = f->dfunc.f_fff( 0.0, 0.0,
+         0.0f ); correct3 = f->dfunc.f_fff( -0.0, 0.0, 0.0f ); correct4 =
+         f->dfunc.f_fff( 0.0, -0.0, 0.0f ); correct5 = f->dfunc.f_fff( -0.0,
+         -0.0, 0.0f ); long double correct6 = f->dfunc.f_fff( 0.0, 0.0, -0.0f );
+                                            long double correct7 =
+         f->dfunc.f_fff( -0.0, 0.0, -0.0f ); long double correct8 =
+         f->dfunc.f_fff( 0.0, -0.0, -0.0f ); long double correct9 =
+         f->dfunc.f_fff( -0.0, -0.0, -0.0f ); err2 =
+         Bruteforce_Ulp_Error_Double( test, correct2  ); err3 =
+         Bruteforce_Ulp_Error_Double( test, correct3  ); err4 =
+         Bruteforce_Ulp_Error_Double( test, correct4  ); err5 =
+         Bruteforce_Ulp_Error_Double( test, correct5  ); float err6 =
+         Bruteforce_Ulp_Error_Double( test, correct6  ); float err7 =
+         Bruteforce_Ulp_Error_Double( test, correct7  ); float err8 =
+         Bruteforce_Ulp_Error_Double( test, correct8  ); float err9 =
+         Bruteforce_Ulp_Error_Double( test, correct9  ); fail =  fail &&
+         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
+         f->double_ulps)) &&
+                                                             (!(fabsf(err4) <=
+         f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)) &&
+                                                             (!(fabsf(err5) <=
+         f->double_ulps)) && (!(fabsf(err6) <= f->double_ulps)) &&
+                                                             (!(fabsf(err7) <=
+         f->double_ulps)) && (!(fabsf(err8) <= f->double_ulps))); if( fabsf(
+         err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) )
+                                                err = err3;
+                                            if( fabsf( err4 ) < fabsf(err ) )
+                                                err = err4;
+                                            if( fabsf( err5 ) < fabsf(err ) )
+                                                err = err5;
+                                            if( fabsf( err6 ) < fabsf(err ) )
+                                                err = err6;
+                                            if( fabsf( err7 ) < fabsf(err ) )
+                                                err = err7;
+                                            if( fabsf( err8 ) < fabsf(err ) )
+                                                err = err8;
+                                            if( fabsf( err9 ) < fabsf(err ) )
+                                                err = err9;
+
+                                            // retry per section 6.5.3.4
+                                            if( IsDoubleResultSubnormal(
+         correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3,
+         f->double_ulps )  || IsDoubleResultSubnormal( correct4, f->double_ulps
+         ) || IsDoubleResultSubnormal( correct5, f->double_ulps )  ||
+                                                IsDoubleResultSubnormal(
+         correct6, f->double_ulps ) || IsDoubleResultSubnormal( correct7,
+         f->double_ulps )  || IsDoubleResultSubnormal( correct8, f->double_ulps
+         ) || IsDoubleResultSubnormal( correct9, f->double_ulps )  )
+                                            {
+                                                fail = fail && ( test != 0.0f);
+                                                if( ! fail )
+                                                    err = 0.0f;
+                                            }
+                                        }
+                                    }
+                                    else if( IsDoubleSubnormal( s3[j] ) )
+                                    {
+                                        correct2 = f->dfunc.f_fff( 0.0, s2[j],
+         0.0 ); correct3 = f->dfunc.f_fff( -0.0, s2[j], 0.0 ); long double
+         correct4 = f->dfunc.f_fff( 0.0,  s2[j], -0.0 ); long double correct5 =
+         f->dfunc.f_fff( -0.0, s2[j], -0.0 ); err2 =
+         Bruteforce_Ulp_Error_Double( test, correct2  ); err3 =
+         Bruteforce_Ulp_Error_Double( test, correct3  ); float err4 =
+         Bruteforce_Ulp_Error_Double( test, correct4  ); float err5 =
+         Bruteforce_Ulp_Error_Double( test, correct5  ); fail =  fail &&
+         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
+         f->double_ulps)) &&
+                                                         (!(fabsf(err4) <=
+         f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); if( fabsf(
+         err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) )
+                                            err = err3;
+                                        if( fabsf( err4 ) < fabsf(err ) )
+                                            err = err4;
+                                        if( fabsf( err5 ) < fabsf(err ) )
+                                            err = err5;
+
+                                        // retry per section 6.5.3.4
+                                        if( IsDoubleResultSubnormal( correct2,
+         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
+         || IsDoubleResultSubnormal( correct4, f->double_ulps ) ||
+         IsDoubleResultSubnormal( correct5, f->double_ulps ) )
+                                        {
+                                            fail = fail && ( test != 0.0f);
+                                            if( ! fail )
+                                                err = 0.0f;
+                                        }
+                                    }
+                                }
+                                else if( fail && IsDoubleSubnormal( s2[j] ) )
+                                {
+                                    long double correct2 = f->dfunc.f_fff( s[j],
+         0.0, s3[j] ); long double correct3 = f->dfunc.f_fff( s[j], -0.0, s3[j]
+         ); float err2 = Bruteforce_Ulp_Error_Double( test, correct2  ); float
+         err3 = Bruteforce_Ulp_Error_Double( test, correct3  ); fail =  fail &&
+         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
+         f->double_ulps))); if( fabsf( err2 ) < fabsf(err ) ) err = err2; if(
+         fabsf( err3 ) < fabsf(err ) ) err = err3;
+
+                                    // retry per section 6.5.3.4
+                                    if( IsDoubleResultSubnormal( correct2,
+         f->double_ulps )  || IsDoubleResultSubnormal( correct3, f->double_ulps
+         ) )
                                     {
                                         fail = fail && ( test != 0.0f);
                                         if( ! fail )
                                             err = 0.0f;
                                     }
+
+                                    //try with second two args as zero
+                                    if( IsDoubleSubnormal( s3[j] ) )
+                                    {
+                                        correct2 = f->dfunc.f_fff( s[j], 0.0,
+         0.0 ); correct3 = f->dfunc.f_fff( s[j], -0.0, 0.0 ); long double
+         correct4 = f->dfunc.f_fff( s[j], 0.0, -0.0 ); long double correct5 =
+         f->dfunc.f_fff( s[j], -0.0, -0.0 ); err2 = Bruteforce_Ulp_Error_Double(
+         test, correct2  ); err3 = Bruteforce_Ulp_Error_Double( test, correct3
+         ); float err4 = Bruteforce_Ulp_Error_Double( test, correct4  ); float
+         err5 = Bruteforce_Ulp_Error_Double( test, correct5  ); fail =  fail &&
+         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
+         f->double_ulps)) &&
+                                                         (!(fabsf(err4) <=
+         f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); if( fabsf(
+         err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) )
+                                            err = err3;
+                                        if( fabsf( err4 ) < fabsf(err ) )
+                                            err = err4;
+                                        if( fabsf( err5 ) < fabsf(err ) )
+                                            err = err5;
+
+                                        // retry per section 6.5.3.4
+                                        if( IsDoubleResultSubnormal( correct2,
+         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
+         || IsDoubleResultSubnormal( correct4, f->double_ulps ) ||
+         IsDoubleResultSubnormal( correct5, f->double_ulps ) )
+                                        {
+                                            fail = fail && ( test != 0.0f);
+                                            if( ! fail )
+                                                err = 0.0f;
+                                        }
+                                    }
                                 }
-                            }
-                            else if( IsDoubleSubnormal( s3[j] ) )
-                            {
-                                correct2 = f->dfunc.f_fff( 0.0, s2[j], 0.0 );
-                                correct3 = f->dfunc.f_fff( -0.0, s2[j], 0.0 );
-                                long double correct4 = f->dfunc.f_fff( 0.0,  s2[j], -0.0 );
-                                long double correct5 = f->dfunc.f_fff( -0.0, s2[j], -0.0 );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                 (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
-
-                                // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )  ||
-                                    IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) )
+                                else if( fail && IsDoubleSubnormal(s3[j]) )
                                 {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    long double correct2 = f->dfunc.f_fff( s[j],
+         s2[j], 0.0 ); long double correct3 = f->dfunc.f_fff( s[j], s2[j], -0.0
+         ); float err2 = Bruteforce_Ulp_Error_Double( test, correct2  ); float
+         err3 = Bruteforce_Ulp_Error_Double( test, correct3  ); fail =  fail &&
+         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
+         f->double_ulps))); if( fabsf( err2 ) < fabsf(err ) ) err = err2; if(
+         fabsf( err3 ) < fabsf(err ) ) err = err3;
+
+                                    // retry per section 6.5.3.4
+                                    if( IsDoubleResultSubnormal( correct2,
+         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
+         )
+                                    {
+                                        fail = fail && ( test != 0.0f);
+                                        if( ! fail )
+                                            err = 0.0f;
+                                    }
                                 }
                             }
-                        }
-                        else if( fail && IsDoubleSubnormal( s2[j] ) )
-                        {
-                            long double correct2 = f->dfunc.f_fff( s[j], 0.0, s3[j] );
-                            long double correct3 = f->dfunc.f_fff( s[j], -0.0, s3[j] );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-
-                            // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps )  || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
-                            {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
-                            }
 
-                            //try with second two args as zero
-                            if( IsDoubleSubnormal( s3[j] ) )
+                            if( fabsf(err ) > maxError )
                             {
-                                correct2 = f->dfunc.f_fff( s[j], 0.0, 0.0 );
-                                correct3 = f->dfunc.f_fff( s[j], -0.0, 0.0 );
-                                long double correct4 = f->dfunc.f_fff( s[j], 0.0, -0.0 );
-                                long double correct5 = f->dfunc.f_fff( s[j], -0.0, -0.0 );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                 (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
-
-                                // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ||
-                                    IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) )
-                                {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
-                                }
+                                maxError = fabsf(err);
+                                maxErrorVal = s[j];
+                                maxErrorVal2 = s2[j];
+                                maxErrorVal3 = s3[j];
                             }
-                        }
-                        else if( fail && IsDoubleSubnormal(s3[j]) )
-                        {
-                            long double correct2 = f->dfunc.f_fff( s[j], s2[j], 0.0 );
-                            long double correct3 = f->dfunc.f_fff( s[j], s2[j], -0.0 );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-
-                            // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
+
+                            if( fail )
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                vlog_error( "\nERROR: %sD%s: %f ulp error at
+         {%a, %a, %a}: *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j],
+         s3[j], ((double*) gOut_Ref)[j], test ); error = -1; goto exit;
                             }
                         }
                     }
-
-                    if( fabsf(err ) > maxError )
-                    {
-                        maxError = fabsf(err);
-                        maxErrorVal = s[j];
-                        maxErrorVal2 = s2[j];
-                        maxErrorVal3 = s3[j];
-                    }
-
-                    if( fail )
-                    {
-                        vlog_error( "\nERROR: %sD%s: %f ulp error at {%a, %a, %a}: *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j], ((double*) gOut_Ref)[j], test );
- error = -1;
- goto exit;
-                    }
                 }
-            }
-        }
-*/
-        if( 0 == (i & 0x0fffffff) )
+        */
+        if (0 == (i & 0x0fffffff))
         {
-            vlog("." );
+            vlog(".");
             fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "pass" );
+            vlog("pass");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
         double *p3 = (double *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
             p3[j] = DoubleFromUInt32(genrand_int32(d));
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, maxErrorVal3 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -1125,6 +1282,3 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
 
     return error;
 }
-
-
-
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index d7f2ebf67d..ca58f2e5fc 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -27,116 +27,122 @@
 #include "harness/parseParameters.h"
 #include "harness/typeWrappers.h"
 
-#if defined( __APPLE__ )
-    #include <sys/sysctl.h>
-    #include <sys/mman.h>
-    #include <libgen.h>
-    #include <sys/time.h>
-#elif defined( __linux__ )
-    #include <unistd.h>
-    #include <sys/syscall.h>
-    #include <linux/sysctl.h>
-    #include <sys/param.h>
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#include <sys/mman.h>
+#include <libgen.h>
+#include <sys/time.h>
+#elif defined(__linux__)
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/sysctl.h>
+#include <sys/param.h>
 #endif
 
-#if defined (__linux__) || (defined WIN32 && defined __MINGW32__)
+#if defined(__linux__) || (defined WIN32 && defined __MINGW32__)
 #include <sys/param.h>
 #endif
 
 #include "harness/testHarness.h"
 
-#define kPageSize           4096
-#define DOUBLE_REQUIRED_FEATURES    ( CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM  )
+#define kPageSize 4096
+#define DOUBLE_REQUIRED_FEATURES                                               \
+    (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO                  \
+     | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM)
 
-const char      **gTestNames = NULL;
-unsigned int    gTestNameCount = 0;
-char            appName[ MAXPATHLEN ] = "";
-cl_device_id    gDevice = NULL;
-cl_context      gContext = NULL;
+const char **gTestNames = NULL;
+unsigned int gTestNameCount = 0;
+char appName[MAXPATHLEN] = "";
+cl_device_id gDevice = NULL;
+cl_context gContext = NULL;
 cl_command_queue gQueue = NULL;
-static int32_t  gStartTestNumber;
-static int32_t  gEndTestNumber;
-int             gSkipCorrectnessTesting = 0;
-int             gStopOnError = 0;
-static bool     gSkipRestOfTests;
-#if defined( __APPLE__ )
-int             gMeasureTimes = 1;
+static int32_t gStartTestNumber;
+static int32_t gEndTestNumber;
+int gSkipCorrectnessTesting = 0;
+int gStopOnError = 0;
+static bool gSkipRestOfTests;
+#if defined(__APPLE__)
+int gMeasureTimes = 1;
 #else
-int             gMeasureTimes = 0;
+int gMeasureTimes = 0;
 #endif
-int             gReportAverageTimes = 0;
-int             gForceFTZ = 0;
-int             gWimpyMode = 0;
-int             gHasDouble = 0;
-int             gTestFloat = 1;
+int gReportAverageTimes = 0;
+int gForceFTZ = 0;
+int gWimpyMode = 0;
+int gHasDouble = 0;
+int gTestFloat = 1;
 // This flag should be 'ON' by default and it can be changed through the command
 // line arguments.
 static int gTestFastRelaxed = 1;
-/*This flag corresponds to defining if the implementation has Derived Fast Relaxed functions.
-  The spec does not specify ULP for derived function.  The derived functions are composed of base functions which are tested for ULP, thus when this flag is enabled,
-  Derived functions will not be tested for ULP, as per table 7.1 of OpenCL 2.0 spec.
-  Since there is no way of quering the device whether it is a derived or non-derived implementation according to OpenCL 2.0 spec then it has to be changed through a command line argument.
+/*This flag corresponds to defining if the implementation has Derived Fast
+  Relaxed functions. The spec does not specify ULP for derived function.  The
+  derived functions are composed of base functions which are tested for ULP,
+  thus when this flag is enabled, Derived functions will not be tested for ULP,
+  as per table 7.1 of OpenCL 2.0 spec. Since there is no way of quering the
+  device whether it is a derived or non-derived implementation according to
+  OpenCL 2.0 spec then it has to be changed through a command line argument.
 */
-int             gFastRelaxedDerived = 1;
-int             gToggleCorrectlyRoundedDivideSqrt = 0;
-int             gDeviceILogb0 = 1;
-int             gDeviceILogbNaN = 1;
-int             gCheckTininessBeforeRounding = 1;
-int             gIsInRTZMode = 0;
-uint32_t        gMaxVectorSizeIndex = VECTOR_SIZE_COUNT;
-uint32_t        gMinVectorSizeIndex = 0;
-const char      *method[] = { "Best", "Average" };
-void            *gIn = NULL;
-void            *gIn2 = NULL;
-void            *gIn3 = NULL;
-void            *gOut_Ref = NULL;
-void            *gOut[VECTOR_SIZE_COUNT] = {NULL, NULL, NULL, NULL, NULL, NULL };
-void            *gOut_Ref2 = NULL;
-void            *gOut2[VECTOR_SIZE_COUNT] = {NULL, NULL, NULL, NULL, NULL, NULL };
-cl_mem          gInBuffer = NULL;
-cl_mem          gInBuffer2 = NULL;
-cl_mem          gInBuffer3 = NULL;
-cl_mem          gOutBuffer[VECTOR_SIZE_COUNT]= {NULL, NULL, NULL, NULL, NULL, NULL };
-cl_mem          gOutBuffer2[VECTOR_SIZE_COUNT]= {NULL, NULL, NULL, NULL, NULL, NULL };
-uint32_t        gComputeDevices = 0;
-uint32_t        gSimdSize = 1;
-uint32_t        gDeviceFrequency = 0;
-static MTdata   gMTdata;
+int gFastRelaxedDerived = 1;
+int gToggleCorrectlyRoundedDivideSqrt = 0;
+int gDeviceILogb0 = 1;
+int gDeviceILogbNaN = 1;
+int gCheckTininessBeforeRounding = 1;
+int gIsInRTZMode = 0;
+uint32_t gMaxVectorSizeIndex = VECTOR_SIZE_COUNT;
+uint32_t gMinVectorSizeIndex = 0;
+const char *method[] = { "Best", "Average" };
+void *gIn = NULL;
+void *gIn2 = NULL;
+void *gIn3 = NULL;
+void *gOut_Ref = NULL;
+void *gOut[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL };
+void *gOut_Ref2 = NULL;
+void *gOut2[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL };
+cl_mem gInBuffer = NULL;
+cl_mem gInBuffer2 = NULL;
+cl_mem gInBuffer3 = NULL;
+cl_mem gOutBuffer[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL };
+cl_mem gOutBuffer2[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL };
+uint32_t gComputeDevices = 0;
+uint32_t gSimdSize = 1;
+uint32_t gDeviceFrequency = 0;
+static MTdata gMTdata;
 cl_device_fp_config gFloatCapabilities = 0;
 cl_device_fp_config gDoubleCapabilities = 0;
-int             gWimpyReductionFactor = 32;
-int             gWimpyBufferSize = BUFFER_SIZE;
-int             gVerboseBruteForce = 0;
+int gWimpyReductionFactor = 32;
+int gWimpyBufferSize = BUFFER_SIZE;
+int gVerboseBruteForce = 0;
 
-static int ParseArgs( int argc, const char **argv );
-static void PrintUsage( void );
-static void PrintFunctions( void );
-test_status InitCL( cl_device_id device );
-static void ReleaseCL( void );
-static int InitILogbConstants( void );
-static int IsTininessDetectedBeforeRounding( void );
-static int IsInRTZMode( void );         //expensive. Please check gIsInRTZMode global instead.
+static int ParseArgs(int argc, const char **argv);
+static void PrintUsage(void);
+static void PrintFunctions(void);
+test_status InitCL(cl_device_id device);
+static void ReleaseCL(void);
+static int InitILogbConstants(void);
+static int IsTininessDetectedBeforeRounding(void);
+static int
+IsInRTZMode(void); // expensive. Please check gIsInRTZMode global instead.
 
 
-int doTest( const char* name )
+int doTest(const char *name)
 {
-    if( gSkipRestOfTests )
+    if (gSkipRestOfTests)
     {
-        vlog( "Skipping function because of an earlier error.\n" );
+        vlog("Skipping function because of an earlier error.\n");
         return 1;
     }
 
     int error = 0;
-    const Func* func_data = NULL;
+    const Func *func_data = NULL;
 
-    for( size_t i = 0; i < functionListCount; i++ )
+    for (size_t i = 0; i < functionListCount; i++)
     {
-        const Func* const temp_func = functionList + i;
-        if( strcmp( temp_func->name, name ) == 0 )
+        const Func *const temp_func = functionList + i;
+        if (strcmp(temp_func->name, name) == 0)
         {
-            if( i < gStartTestNumber || i > gEndTestNumber )
+            if (i < gStartTestNumber || i > gEndTestNumber)
             {
-                vlog( "Skipping function #%d\n", i );
+                vlog("Skipping function #%d\n", i);
                 return 0;
             }
 
@@ -145,32 +151,35 @@ int doTest( const char* name )
         }
     }
 
-    if( func_data == NULL )
+    if (func_data == NULL)
     {
-        vlog( "Function '%s' doesn't exist!\n", name );
-        exit( EXIT_FAILURE );
+        vlog("Function '%s' doesn't exist!\n", name);
+        exit(EXIT_FAILURE);
     }
 
-    if( func_data->func.p == NULL )
+    if (func_data->func.p == NULL)
     {
-        vlog( "'%s' is missing implementation, skipping function.\n", func_data->name );
+        vlog("'%s' is missing implementation, skipping function.\n",
+             func_data->name);
         return 0;
     }
 
     // if correctly rounded divide & sqrt are supported by the implementation
     // then test it; otherwise skip the test
-    if( strcmp( func_data->name, "sqrt_cr" ) == 0 || strcmp( func_data->name, "divide_cr" ) == 0 )
+    if (strcmp(func_data->name, "sqrt_cr") == 0
+        || strcmp(func_data->name, "divide_cr") == 0)
     {
-        if( ( gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT ) == 0 )
+        if ((gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT) == 0)
         {
-            vlog( "Correctly rounded divide and sqrt are not supported, skipping function.\n" );
+            vlog("Correctly rounded divide and sqrt are not supported, "
+                 "skipping function.\n");
             return 0;
         }
     }
 
     {
         extern int my_ilogb(double);
-        if( 0 == strcmp( "ilogb", func_data->name ) )
+        if (0 == strcmp("ilogb", func_data->name))
         {
             InitILogbConstants();
         }
@@ -201,17 +210,17 @@ int doTest( const char* name )
             }
         }
 
-        if( gTestFloat )
+        if (gTestFloat)
         {
             gTestCount++;
-            vlog( "%3d: ", gTestCount );
+            vlog("%3d: ", gTestCount);
             // Don't test with relaxed requirements.
             if (func_data->vtbl_ptr->TestFunc(func_data, gMTdata,
                                               false /* relaxed mode */))
             {
                 gFailCount++;
                 error++;
-                if( gStopOnError )
+                if (gStopOnError)
                 {
                     gSkipRestOfTests = true;
                     return error;
@@ -219,17 +228,18 @@ int doTest( const char* name )
             }
         }
 
-        if( gHasDouble && NULL != func_data->vtbl_ptr->DoubleTestFunc && NULL != func_data->dfunc.p )
+        if (gHasDouble && NULL != func_data->vtbl_ptr->DoubleTestFunc
+            && NULL != func_data->dfunc.p)
         {
             gTestCount++;
-            vlog( "%3d: ", gTestCount );
+            vlog("%3d: ", gTestCount);
             // Don't test with relaxed requirements.
             if (func_data->vtbl_ptr->DoubleTestFunc(func_data, gMTdata,
                                                     false /* relaxed mode*/))
             {
                 gFailCount++;
                 error++;
-                if( gStopOnError )
+                if (gStopOnError)
                 {
                     gSkipRestOfTests = true;
                     return error;
@@ -241,515 +251,549 @@ int doTest( const char* name )
     return error;
 }
 
-int test_acos( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_acos(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "acos" );
+    return doTest("acos");
 }
-int test_acosh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_acosh(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "acosh" );
+    return doTest("acosh");
 }
-int test_acospi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_acospi(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "acospi" );
+    return doTest("acospi");
 }
-int test_asin( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_asin(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "asin" );
+    return doTest("asin");
 }
-int test_asinh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_asinh(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "asinh" );
+    return doTest("asinh");
 }
-int test_asinpi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_asinpi(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "asinpi" );
+    return doTest("asinpi");
 }
-int test_atan( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_atan(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "atan" );
+    return doTest("atan");
 }
-int test_atanh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_atanh(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "atanh" );
+    return doTest("atanh");
 }
-int test_atanpi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_atanpi(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "atanpi" );
+    return doTest("atanpi");
 }
-int test_atan2( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_atan2(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "atan2" );
+    return doTest("atan2");
 }
-int test_atan2pi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_atan2pi(cl_device_id deviceID, cl_context context,
+                 cl_command_queue queue, int num_elements)
 {
-    return doTest( "atan2pi" );
+    return doTest("atan2pi");
 }
-int test_cbrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_cbrt(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "cbrt" );
+    return doTest("cbrt");
 }
-int test_ceil( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_ceil(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "ceil" );
+    return doTest("ceil");
 }
-int test_copysign( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_copysign(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "copysign" );
+    return doTest("copysign");
 }
-int test_cos( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_cos(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "cos" );
+    return doTest("cos");
 }
-int test_cosh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_cosh(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "cosh" );
+    return doTest("cosh");
 }
-int test_cospi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_cospi(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "cospi" );
+    return doTest("cospi");
 }
-int test_exp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_exp(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "exp" );
+    return doTest("exp");
 }
-int test_exp2( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_exp2(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "exp2" );
+    return doTest("exp2");
 }
-int test_exp10( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_exp10(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "exp10" );
+    return doTest("exp10");
 }
-int test_expm1( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_expm1(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "expm1" );
+    return doTest("expm1");
 }
-int test_fabs( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_fabs(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "fabs" );
+    return doTest("fabs");
 }
-int test_fdim( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_fdim(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "fdim" );
+    return doTest("fdim");
 }
-int test_floor( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_floor(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "floor" );
+    return doTest("floor");
 }
-int test_fma( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_fma(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "fma" );
+    return doTest("fma");
 }
-int test_fmax( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_fmax(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "fmax" );
+    return doTest("fmax");
 }
-int test_fmin( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_fmin(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "fmin" );
+    return doTest("fmin");
 }
-int test_fmod( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_fmod(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "fmod" );
+    return doTest("fmod");
 }
-int test_fract( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_fract(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "fract" );
+    return doTest("fract");
 }
-int test_frexp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_frexp(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "frexp" );
+    return doTest("frexp");
 }
-int test_hypot( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_hypot(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "hypot" );
+    return doTest("hypot");
 }
-int test_ilogb( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_ilogb(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "ilogb" );
+    return doTest("ilogb");
 }
-int test_isequal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isequal(cl_device_id deviceID, cl_context context,
+                 cl_command_queue queue, int num_elements)
 {
-    return doTest( "isequal" );
+    return doTest("isequal");
 }
-int test_isfinite( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isfinite(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "isfinite" );
+    return doTest("isfinite");
 }
-int test_isgreater( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isgreater(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "isgreater" );
+    return doTest("isgreater");
 }
-int test_isgreaterequal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isgreaterequal(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements)
 {
-    return doTest( "isgreaterequal" );
+    return doTest("isgreaterequal");
 }
-int test_isinf( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isinf(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "isinf" );
+    return doTest("isinf");
 }
-int test_isless( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isless(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "isless" );
+    return doTest("isless");
 }
-int test_islessequal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_islessequal(cl_device_id deviceID, cl_context context,
+                     cl_command_queue queue, int num_elements)
 {
-    return doTest( "islessequal" );
+    return doTest("islessequal");
 }
-int test_islessgreater( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_islessgreater(cl_device_id deviceID, cl_context context,
+                       cl_command_queue queue, int num_elements)
 {
-    return doTest( "islessgreater" );
+    return doTest("islessgreater");
 }
-int test_isnan( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isnan(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "isnan" );
+    return doTest("isnan");
 }
-int test_isnormal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isnormal(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "isnormal" );
+    return doTest("isnormal");
 }
-int test_isnotequal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isnotequal(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
 {
-    return doTest( "isnotequal" );
+    return doTest("isnotequal");
 }
-int test_isordered( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isordered(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "isordered" );
+    return doTest("isordered");
 }
-int test_isunordered( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isunordered(cl_device_id deviceID, cl_context context,
+                     cl_command_queue queue, int num_elements)
 {
-    return doTest( "isunordered" );
+    return doTest("isunordered");
 }
-int test_ldexp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_ldexp(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "ldexp" );
+    return doTest("ldexp");
 }
-int test_lgamma( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_lgamma(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "lgamma" );
+    return doTest("lgamma");
 }
-int test_lgamma_r( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_lgamma_r(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "lgamma_r" );
+    return doTest("lgamma_r");
 }
-int test_log( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_log(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "log" );
+    return doTest("log");
 }
-int test_log2( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_log2(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "log2" );
+    return doTest("log2");
 }
-int test_log10( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_log10(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "log10" );
+    return doTest("log10");
 }
-int test_log1p( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_log1p(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "log1p" );
+    return doTest("log1p");
 }
-int test_logb( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_logb(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "logb" );
+    return doTest("logb");
 }
-int test_mad( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_mad(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "mad" );
+    return doTest("mad");
 }
-int test_maxmag( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_maxmag(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "maxmag" );
+    return doTest("maxmag");
 }
-int test_minmag( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_minmag(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "minmag" );
+    return doTest("minmag");
 }
-int test_modf( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_modf(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "modf" );
+    return doTest("modf");
 }
-int test_nan( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_nan(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "nan" );
+    return doTest("nan");
 }
-int test_nextafter( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_nextafter(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "nextafter" );
+    return doTest("nextafter");
 }
-int test_pow( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_pow(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "pow" );
+    return doTest("pow");
 }
-int test_pown( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_pown(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "pown" );
+    return doTest("pown");
 }
-int test_powr( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_powr(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "powr" );
+    return doTest("powr");
 }
-int test_remainder( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_remainder(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "remainder" );
+    return doTest("remainder");
 }
-int test_remquo( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_remquo(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "remquo" );
+    return doTest("remquo");
 }
-int test_rint( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_rint(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "rint" );
+    return doTest("rint");
 }
-int test_rootn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_rootn(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "rootn" );
+    return doTest("rootn");
 }
-int test_round( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_round(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "round" );
+    return doTest("round");
 }
-int test_rsqrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_rsqrt(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "rsqrt" );
+    return doTest("rsqrt");
 }
-int test_signbit( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_signbit(cl_device_id deviceID, cl_context context,
+                 cl_command_queue queue, int num_elements)
 {
-    return doTest( "signbit" );
+    return doTest("signbit");
 }
-int test_sin( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_sin(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "sin" );
+    return doTest("sin");
 }
-int test_sincos( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_sincos(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "sincos" );
+    return doTest("sincos");
 }
-int test_sinh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_sinh(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "sinh" );
+    return doTest("sinh");
 }
-int test_sinpi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_sinpi(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "sinpi" );
+    return doTest("sinpi");
 }
-int test_sqrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_sqrt(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "sqrt" );
+    return doTest("sqrt");
 }
-int test_sqrt_cr( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_sqrt_cr(cl_device_id deviceID, cl_context context,
+                 cl_command_queue queue, int num_elements)
 {
-    return doTest( "sqrt_cr" );
+    return doTest("sqrt_cr");
 }
-int test_tan( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_tan(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "tan" );
+    return doTest("tan");
 }
-int test_tanh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_tanh(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "tanh" );
+    return doTest("tanh");
 }
-int test_tanpi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_tanpi(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "tanpi" );
+    return doTest("tanpi");
 }
-int test_trunc( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_trunc(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "trunc" );
+    return doTest("trunc");
 }
-int test_half_cos( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_cos(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_cos" );
+    return doTest("half_cos");
 }
-int test_half_divide( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_divide(cl_device_id deviceID, cl_context context,
+                     cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_divide" );
+    return doTest("half_divide");
 }
-int test_half_exp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_exp(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_exp" );
+    return doTest("half_exp");
 }
-int test_half_exp2( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_exp2(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_exp2" );
+    return doTest("half_exp2");
 }
-int test_half_exp10( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_exp10(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_exp10" );
+    return doTest("half_exp10");
 }
-int test_half_log( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_log(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_log" );
+    return doTest("half_log");
 }
-int test_half_log2( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_log2(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_log2" );
+    return doTest("half_log2");
 }
-int test_half_log10( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_log10(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_log10" );
+    return doTest("half_log10");
 }
-int test_half_powr( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_powr(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_powr" );
+    return doTest("half_powr");
 }
-int test_half_recip( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_recip(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_recip" );
+    return doTest("half_recip");
 }
-int test_half_rsqrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_rsqrt(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_rsqrt" );
+    return doTest("half_rsqrt");
 }
-int test_half_sin( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_sin(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_sin" );
+    return doTest("half_sin");
 }
-int test_half_sqrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_sqrt(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_sqrt" );
+    return doTest("half_sqrt");
 }
-int test_half_tan( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_tan(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_tan" );
+    return doTest("half_tan");
 }
-int test_add( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_add(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "add" );
+    return doTest("add");
 }
-int test_subtract( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_subtract(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "subtract" );
+    return doTest("subtract");
 }
-int test_divide( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_divide(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "divide" );
+    return doTest("divide");
 }
-int test_divide_cr( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_divide_cr(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "divide_cr" );
+    return doTest("divide_cr");
 }
-int test_multiply( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_multiply(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "multiply" );
+    return doTest("multiply");
 }
-int test_assignment( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_assignment(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
 {
-    return doTest( "assignment" );
+    return doTest("assignment");
 }
-int test_not( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_not(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "not" );
+    return doTest("not");
 }
 
 test_definition test_list[] = {
-    ADD_TEST( acos ),
-    ADD_TEST( acosh ),
-    ADD_TEST( acospi ),
-    ADD_TEST( asin ),
-    ADD_TEST( asinh ),
-    ADD_TEST( asinpi ),
-    ADD_TEST( atan ),
-    ADD_TEST( atanh ),
-    ADD_TEST( atanpi ),
-    ADD_TEST( atan2 ),
-    ADD_TEST( atan2pi ),
-    ADD_TEST( cbrt ),
-    ADD_TEST( ceil ),
-    ADD_TEST( copysign ),
-    ADD_TEST( cos ),
-    ADD_TEST( cosh ),
-    ADD_TEST( cospi ),
-    ADD_TEST( exp ),
-    ADD_TEST( exp2 ),
-    ADD_TEST( exp10 ),
-    ADD_TEST( expm1 ),
-    ADD_TEST( fabs ),
-    ADD_TEST( fdim ),
-    ADD_TEST( floor ),
-    ADD_TEST( fma ),
-    ADD_TEST( fmax ),
-    ADD_TEST( fmin ),
-    ADD_TEST( fmod ),
-    ADD_TEST( fract ),
-    ADD_TEST( frexp ),
-    ADD_TEST( hypot ),
-    ADD_TEST( ilogb ),
-    ADD_TEST( isequal ),
-    ADD_TEST( isfinite ),
-    ADD_TEST( isgreater ),
-    ADD_TEST( isgreaterequal ),
-    ADD_TEST( isinf ),
-    ADD_TEST( isless ),
-    ADD_TEST( islessequal ),
-    ADD_TEST( islessgreater ),
-    ADD_TEST( isnan ),
-    ADD_TEST( isnormal ),
-    ADD_TEST( isnotequal ),
-    ADD_TEST( isordered ),
-    ADD_TEST( isunordered ),
-    ADD_TEST( ldexp ),
-    ADD_TEST( lgamma ),
-    ADD_TEST( lgamma_r ),
-    ADD_TEST( log ),
-    ADD_TEST( log2 ),
-    ADD_TEST( log10 ),
-    ADD_TEST( log1p ),
-    ADD_TEST( logb ),
-    ADD_TEST( mad ),
-    ADD_TEST( maxmag ),
-    ADD_TEST( minmag ),
-    ADD_TEST( modf ),
-    ADD_TEST( nan ),
-    ADD_TEST( nextafter ),
-    ADD_TEST( pow ),
-    ADD_TEST( pown ),
-    ADD_TEST( powr ),
-    ADD_TEST( remainder ),
-    ADD_TEST( remquo ),
-    ADD_TEST( rint ),
-    ADD_TEST( rootn ),
-    ADD_TEST( round ),
-    ADD_TEST( rsqrt ),
-    ADD_TEST( signbit ),
-    ADD_TEST( sin ),
-    ADD_TEST( sincos ),
-    ADD_TEST( sinh ),
-    ADD_TEST( sinpi ),
-    ADD_TEST( sqrt ),
-    ADD_TEST( sqrt_cr ),
-    ADD_TEST( tan ),
-    ADD_TEST( tanh ),
-    ADD_TEST( tanpi ),
-    ADD_TEST( trunc ),
-    ADD_TEST( half_cos ),
-    ADD_TEST( half_divide ),
-    ADD_TEST( half_exp ),
-    ADD_TEST( half_exp2 ),
-    ADD_TEST( half_exp10 ),
-    ADD_TEST( half_log ),
-    ADD_TEST( half_log2 ),
-    ADD_TEST( half_log10 ),
-    ADD_TEST( half_powr ),
-    ADD_TEST( half_recip ),
-    ADD_TEST( half_rsqrt ),
-    ADD_TEST( half_sin ),
-    ADD_TEST( half_sqrt ),
-    ADD_TEST( half_tan ),
-    ADD_TEST( add ),
-    ADD_TEST( subtract ),
-    ADD_TEST( divide ),
-    ADD_TEST( divide_cr ),
-    ADD_TEST( multiply ),
-    ADD_TEST( assignment ),
-    ADD_TEST( not ),
+    ADD_TEST(acos),          ADD_TEST(acosh),      ADD_TEST(acospi),
+    ADD_TEST(asin),          ADD_TEST(asinh),      ADD_TEST(asinpi),
+    ADD_TEST(atan),          ADD_TEST(atanh),      ADD_TEST(atanpi),
+    ADD_TEST(atan2),         ADD_TEST(atan2pi),    ADD_TEST(cbrt),
+    ADD_TEST(ceil),          ADD_TEST(copysign),   ADD_TEST(cos),
+    ADD_TEST(cosh),          ADD_TEST(cospi),      ADD_TEST(exp),
+    ADD_TEST(exp2),          ADD_TEST(exp10),      ADD_TEST(expm1),
+    ADD_TEST(fabs),          ADD_TEST(fdim),       ADD_TEST(floor),
+    ADD_TEST(fma),           ADD_TEST(fmax),       ADD_TEST(fmin),
+    ADD_TEST(fmod),          ADD_TEST(fract),      ADD_TEST(frexp),
+    ADD_TEST(hypot),         ADD_TEST(ilogb),      ADD_TEST(isequal),
+    ADD_TEST(isfinite),      ADD_TEST(isgreater),  ADD_TEST(isgreaterequal),
+    ADD_TEST(isinf),         ADD_TEST(isless),     ADD_TEST(islessequal),
+    ADD_TEST(islessgreater), ADD_TEST(isnan),      ADD_TEST(isnormal),
+    ADD_TEST(isnotequal),    ADD_TEST(isordered),  ADD_TEST(isunordered),
+    ADD_TEST(ldexp),         ADD_TEST(lgamma),     ADD_TEST(lgamma_r),
+    ADD_TEST(log),           ADD_TEST(log2),       ADD_TEST(log10),
+    ADD_TEST(log1p),         ADD_TEST(logb),       ADD_TEST(mad),
+    ADD_TEST(maxmag),        ADD_TEST(minmag),     ADD_TEST(modf),
+    ADD_TEST(nan),           ADD_TEST(nextafter),  ADD_TEST(pow),
+    ADD_TEST(pown),          ADD_TEST(powr),       ADD_TEST(remainder),
+    ADD_TEST(remquo),        ADD_TEST(rint),       ADD_TEST(rootn),
+    ADD_TEST(round),         ADD_TEST(rsqrt),      ADD_TEST(signbit),
+    ADD_TEST(sin),           ADD_TEST(sincos),     ADD_TEST(sinh),
+    ADD_TEST(sinpi),         ADD_TEST(sqrt),       ADD_TEST(sqrt_cr),
+    ADD_TEST(tan),           ADD_TEST(tanh),       ADD_TEST(tanpi),
+    ADD_TEST(trunc),         ADD_TEST(half_cos),   ADD_TEST(half_divide),
+    ADD_TEST(half_exp),      ADD_TEST(half_exp2),  ADD_TEST(half_exp10),
+    ADD_TEST(half_log),      ADD_TEST(half_log2),  ADD_TEST(half_log10),
+    ADD_TEST(half_powr),     ADD_TEST(half_recip), ADD_TEST(half_rsqrt),
+    ADD_TEST(half_sin),      ADD_TEST(half_sqrt),  ADD_TEST(half_tan),
+    ADD_TEST(add),           ADD_TEST(subtract),   ADD_TEST(divide),
+    ADD_TEST(divide_cr),     ADD_TEST(multiply),   ADD_TEST(assignment),
+    ADD_TEST(not),
 };
 
-const int test_num = ARRAY_SIZE( test_list );
+const int test_num = ARRAY_SIZE(test_list);
 
 #pragma mark -
 
-int main (int argc, const char * argv[])
+int main(int argc, const char *argv[])
 {
     int error;
 
@@ -759,60 +803,59 @@ int main (int argc, const char * argv[])
         return -1;
     }
 
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
     struct timeval startTime;
-    gettimeofday( &startTime, NULL );
+    gettimeofday(&startTime, NULL);
 #endif
 
-    error = ParseArgs( argc, argv );
-    if( error )
-        return error;
+    error = ParseArgs(argc, argv);
+    if (error) return error;
 
     // This takes a while, so prevent the machine from going to sleep.
     PreventSleep();
-    atexit( ResumeSleep );
+    atexit(ResumeSleep);
 
-    if( gSkipCorrectnessTesting )
-        vlog( "*** Skipping correctness testing! ***\n\n" );
-    else if( gStopOnError )
-        vlog( "Stopping at first error.\n" );
+    if (gSkipCorrectnessTesting)
+        vlog("*** Skipping correctness testing! ***\n\n");
+    else if (gStopOnError)
+        vlog("Stopping at first error.\n");
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        vlog( "%s times are reported at right (cycles per element):\n", method[gReportAverageTimes] );
-        vlog( "\n" );
-        if( gSkipCorrectnessTesting )
-            vlog( "   \t               ");
+        vlog("%s times are reported at right (cycles per element):\n",
+             method[gReportAverageTimes]);
+        vlog("\n");
+        if (gSkipCorrectnessTesting)
+            vlog("   \t               ");
         else
-            vlog( "   \t                                        ");
-        if( gWimpyMode )
-            vlog( "   " );
-        for( int i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-            vlog( "\t  float%s", sizeNames[i] );
+            vlog("   \t                                        ");
+        if (gWimpyMode) vlog("   ");
+        for (int i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+            vlog("\t  float%s", sizeNames[i]);
     }
     else
     {
-        vlog( "   \t                                        ");
-        if( gWimpyMode )
-            vlog( "   " );
+        vlog("   \t                                        ");
+        if (gWimpyMode) vlog("   ");
     }
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t  max_ulps" );
+    if (!gSkipCorrectnessTesting) vlog("\t  max_ulps");
 
-    vlog( "\n-----------------------------------------------------------------------------------------------------------\n" );
+    vlog("\n-------------------------------------------------------------------"
+         "----------------------------------------\n");
 
-    gMTdata = init_genrand( gRandomSeed );
-    if( gEndTestNumber == 0 )
+    gMTdata = init_genrand(gRandomSeed);
+    if (gEndTestNumber == 0)
     {
         gEndTestNumber = functionListCount;
     }
 
     FPU_mode_type oldMode;
-    DisableFTZ( &oldMode );
+    DisableFTZ(&oldMode);
 
-    int ret = runTestHarnessWithCheck( gTestNameCount, gTestNames, test_num, test_list, true, 0, InitCL );
+    int ret = runTestHarnessWithCheck(gTestNameCount, gTestNames, test_num,
+                                      test_list, true, 0, InitCL);
 
-    RestoreFPState( &oldMode );
+    RestoreFPState(&oldMode);
 
     free_mtdata(gMTdata);
     free(gTestNames);
@@ -825,24 +868,24 @@ int main (int argc, const char * argv[])
 
     ReleaseCL();
 
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
     struct timeval endTime;
-    gettimeofday( &endTime, NULL );
-    double time = (double) endTime.tv_sec - (double) startTime.tv_sec;
-    time += 1e-6 * ((double) endTime.tv_usec - (double) startTime.tv_usec);
-    vlog( "time: %f s\n", time );
+    gettimeofday(&endTime, NULL);
+    double time = (double)endTime.tv_sec - (double)startTime.tv_sec;
+    time += 1e-6 * ((double)endTime.tv_usec - (double)startTime.tv_usec);
+    vlog("time: %f s\n", time);
 #endif
 
     return ret;
 }
 
-static int ParseArgs( int argc, const char **argv )
+static int ParseArgs(int argc, const char **argv)
 {
     int i;
-    gTestNames = (const char**) calloc( argc - 1, sizeof( char*) );
-    if( NULL == gTestNames )
+    gTestNames = (const char **)calloc(argc - 1, sizeof(char *));
+    if (NULL == gTestNames)
     {
-        vlog( "Failed to allocate memory for gTestNames array.\n" );
+        vlog("Failed to allocate memory for gTestNames array.\n");
         return 1;
     }
     gTestNames[0] = argv[0];
@@ -850,91 +893,64 @@ static int ParseArgs( int argc, const char **argv )
     int singleThreaded = 0;
 
     { // Extract the app name
-        strncpy( appName, argv[0], MAXPATHLEN );
+        strncpy(appName, argv[0], MAXPATHLEN);
 
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
         char baseName[MAXPATHLEN];
         char *base = NULL;
-        strncpy( baseName, argv[0], MAXPATHLEN );
-        base = basename( baseName );
-        if( NULL != base )
+        strncpy(baseName, argv[0], MAXPATHLEN);
+        base = basename(baseName);
+        if (NULL != base)
         {
-            strncpy( appName, base, sizeof( appName )  );
-            appName[ sizeof( appName ) -1 ] = '\0';
+            strncpy(appName, base, sizeof(appName));
+            appName[sizeof(appName) - 1] = '\0';
         }
 #endif
     }
 
-    vlog( "\n%s\t", appName );
-    for( i = 1; i < argc; i++ )
+    vlog("\n%s\t", appName);
+    for (i = 1; i < argc; i++)
     {
         const char *arg = argv[i];
-        if( NULL == arg )
-            break;
+        if (NULL == arg) break;
 
-        vlog( "\t%s", arg );
+        vlog("\t%s", arg);
         int optionFound = 0;
-        if( arg[0] == '-' )
+        if (arg[0] == '-')
         {
-            while( arg[1] != '\0' )
+            while (arg[1] != '\0')
             {
                 arg++;
                 optionFound = 1;
-                switch( *arg )
+                switch (*arg)
                 {
-                    case 'a':
-                        gReportAverageTimes ^= 1;
-                        break;
+                    case 'a': gReportAverageTimes ^= 1; break;
 
-                    case 'c':
-                        gToggleCorrectlyRoundedDivideSqrt ^= 1;
-                        break;
+                    case 'c': gToggleCorrectlyRoundedDivideSqrt ^= 1; break;
 
-                    case 'd':
-                        gHasDouble ^= 1;
-                        break;
+                    case 'd': gHasDouble ^= 1; break;
 
-                    case 'e':
-                        gFastRelaxedDerived ^= 1;
-                        break;
+                    case 'e': gFastRelaxedDerived ^= 1; break;
 
-                    case 'f':
-                        gTestFloat ^= 1;
-                        break;
+                    case 'f': gTestFloat ^= 1; break;
 
-                    case 'h':
-                        PrintUsage();
-                        return -1;
+                    case 'h': PrintUsage(); return -1;
 
-                    case 'p':
-                      PrintFunctions();
-                      return -1;
+                    case 'p': PrintFunctions(); return -1;
 
-                    case 'l':
-                        gSkipCorrectnessTesting ^= 1;
-                        break;
+                    case 'l': gSkipCorrectnessTesting ^= 1; break;
 
-                    case 'm':
-                        singleThreaded ^= 1;
-                        break;
+                    case 'm': singleThreaded ^= 1; break;
 
-                    case 'r':
-                        gTestFastRelaxed ^= 1;
-                        break;
+                    case 'r': gTestFastRelaxed ^= 1; break;
 
-                    case 's':
-                        gStopOnError ^= 1;
-                        break;
+                    case 's': gStopOnError ^= 1; break;
 
-                    case 't':
-                        gMeasureTimes ^= 1;
-                        break;
+                    case 't': gMeasureTimes ^= 1; break;
 
-                    case 'v':
-                        gVerboseBruteForce ^= 1;
-                        break;
+                    case 'v': gVerboseBruteForce ^= 1; break;
 
-                    case 'w':   // wimpy mode
+                    case 'w': // wimpy mode
                         gWimpyMode ^= 1;
                         break;
 
@@ -942,12 +958,10 @@ static int ParseArgs( int argc, const char **argv )
                         parseWimpyReductionFactor(arg, gWimpyReductionFactor);
                         break;
 
-                    case 'z':
-                        gForceFTZ ^= 1;
-                        break;
+                    case 'z': gForceFTZ ^= 1; break;
 
                     case '1':
-                        if( arg[1] == '6' )
+                        if (arg[1] == '6')
                         {
                             gMinVectorSizeIndex = 5;
                             gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
@@ -960,52 +974,52 @@ static int ParseArgs( int argc, const char **argv )
                         }
                         break;
                     case '2':
-                            gMinVectorSizeIndex = 1;
-                            gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
-                            break;
+                        gMinVectorSizeIndex = 1;
+                        gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
+                        break;
                     case '3':
-                            gMinVectorSizeIndex = 2;
-                            gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
-                            break;
+                        gMinVectorSizeIndex = 2;
+                        gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
+                        break;
                     case '4':
-                            gMinVectorSizeIndex = 3;
-                            gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
-                            break;
+                        gMinVectorSizeIndex = 3;
+                        gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
+                        break;
                     case '8':
-                            gMinVectorSizeIndex = 4;
-                            gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
-                            break;
+                        gMinVectorSizeIndex = 4;
+                        gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
+                        break;
                         break;
 
                     default:
-                        vlog( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg );
+                        vlog(" <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg);
                         PrintUsage();
                         return -1;
                 }
             }
         }
 
-        if( ! optionFound )
+        if (!optionFound)
         {
             char *t = NULL;
-            long number = strtol( arg, &t, 0 );
-            if( t != arg )
+            long number = strtol(arg, &t, 0);
+            if (t != arg)
             {
-                if( 0 == gStartTestNumber )
-                    gStartTestNumber = (int32_t) number;
+                if (0 == gStartTestNumber)
+                    gStartTestNumber = (int32_t)number;
                 else
-                    gEndTestNumber = gStartTestNumber + (int32_t) number;
+                    gEndTestNumber = gStartTestNumber + (int32_t)number;
             }
             else
             {
                 // Make sure this is a valid name
                 unsigned int k;
-                for (k=0; k<functionListCount; k++)
+                for (k = 0; k < functionListCount; k++)
                 {
-                    const Func *f = functionList+k;
+                    const Func *f = functionList + k;
                     if (strcmp(arg, f->name) == 0)
                     {
-                        gTestNames[ gTestNameCount ] = arg;
+                        gTestNames[gTestNameCount] = arg;
                         gTestNameCount++;
                         break;
                     }
@@ -1021,118 +1035,141 @@ static int ParseArgs( int argc, const char **argv )
     }
 
     // Check for the wimpy mode environment variable
-    if (getenv("CL_WIMPY_MODE")) {
-      vlog( "\n" );
-      vlog( "*** Detected CL_WIMPY_MODE env                          ***\n" );
-      gWimpyMode = 1;
+    if (getenv("CL_WIMPY_MODE"))
+    {
+        vlog("\n");
+        vlog("*** Detected CL_WIMPY_MODE env                          ***\n");
+        gWimpyMode = 1;
     }
 
-    vlog( "\nTest binary built %s %s\n", __DATE__, __TIME__ );
+    vlog("\nTest binary built %s %s\n", __DATE__, __TIME__);
 
     PrintArch();
 
-    if( gWimpyMode )
+    if (gWimpyMode)
     {
-        vlog( "\n" );
-        vlog( "*** WARNING: Testing in Wimpy mode!                     ***\n" );
-        vlog( "*** Wimpy mode is not sufficient to verify correctness. ***\n" );
-        vlog( "*** Wimpy Reduction Factor: %-27u ***\n\n", gWimpyReductionFactor );
+        vlog("\n");
+        vlog("*** WARNING: Testing in Wimpy mode!                     ***\n");
+        vlog("*** Wimpy mode is not sufficient to verify correctness. ***\n");
+        vlog("*** Wimpy Reduction Factor: %-27u ***\n\n",
+             gWimpyReductionFactor);
     }
 
-    if( singleThreaded )
-        SetThreadCount(1);
+    if (singleThreaded) SetThreadCount(1);
 
     return 0;
 }
 
 
-static void PrintFunctions ( void )
+static void PrintFunctions(void)
 {
-  vlog( "\nMath function names:\n" );
-  for( int i = 0; i < functionListCount; i++ )
-  {
-    vlog( "\t%s\n", functionList[ i ].name );
-  }
+    vlog("\nMath function names:\n");
+    for (int i = 0; i < functionListCount; i++)
+    {
+        vlog("\t%s\n", functionList[i].name);
+    }
 }
 
-static void PrintUsage( void )
-{
-    vlog( "%s [-acglstz]: <optional: math function names>\n", appName );
-    vlog( "\toptions:\n" );
-    vlog( "\t\t-a\tReport average times instead of best times\n" );
-    vlog( "\t\t-c\tToggle test fp correctly rounded divide and sqrt (Default: off)\n");
-    vlog( "\t\t-d\tToggle double precision testing. (Default: on iff khr_fp_64 on)\n" );
-    vlog( "\t\t-f\tToggle float precision testing. (Default: on)\n" );
-    vlog( "\t\t-r\tToggle fast relaxed math precision testing. (Default: on)\n" );
-    vlog( "\t\t-e\tToggle test as derived implementations for fast relaxed math precision. (Default: on)\n" );
-    vlog( "\t\t-h\tPrint this message and quit\n" );
-    vlog( "\t\t-p\tPrint all math function names and quit\n" );
-    vlog( "\t\t-l\tlink check only (make sure functions are present, skip accuracy checks.)\n" );
-    vlog( "\t\t-m\tToggle run multi-threaded. (Default: on) )\n" );
-    vlog( "\t\t-s\tStop on error\n" );
-    vlog( "\t\t-t\tToggle timing  (on by default)\n" );
-    vlog( "\t\t-w\tToggle Wimpy Mode, * Not a valid test * \n");
-    vlog( "\t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is 1-10, default factor(%u)\n",gWimpyReductionFactor );
-    vlog( "\t\t-z\tToggle FTZ mode (Section 6.5.3) for all functions. (Set by device capabilities by default.)\n" );
-    vlog( "\t\t-v\tToggle Verbosity (Default: off)\n ");
-    vlog( "\t\t-#\tTest only vector sizes #, e.g. \"-1\" tests scalar only, \"-16\" tests 16-wide vectors only.\n" );
-    vlog( "\n\tYou may also pass a number instead of a function name.\n" );
-    vlog( "\tThis causes the first N tests to be skipped. The tests are numbered.\n" );
-    vlog( "\tIf you pass a second number, that is the number tests to run after the first one.\n" );
-    vlog( "\tA name list may be used in conjunction with a number range. In that case,\n" );
-    vlog( "\tonly the named cases in the number range will run.\n" );
-    vlog( "\tYou may also choose to pass no arguments, in which case all tests will be run.\n" );
-    vlog( "\tYou may pass CL_DEVICE_TYPE_CPU/GPU/ACCELERATOR to select the device.\n" );
-    vlog( "\n" );
+static void PrintUsage(void)
+{
+    vlog("%s [-acglstz]: <optional: math function names>\n", appName);
+    vlog("\toptions:\n");
+    vlog("\t\t-a\tReport average times instead of best times\n");
+    vlog("\t\t-c\tToggle test fp correctly rounded divide and sqrt (Default: "
+         "off)\n");
+    vlog("\t\t-d\tToggle double precision testing. (Default: on iff khr_fp_64 "
+         "on)\n");
+    vlog("\t\t-f\tToggle float precision testing. (Default: on)\n");
+    vlog("\t\t-r\tToggle fast relaxed math precision testing. (Default: on)\n");
+    vlog("\t\t-e\tToggle test as derived implementations for fast relaxed math "
+         "precision. (Default: on)\n");
+    vlog("\t\t-h\tPrint this message and quit\n");
+    vlog("\t\t-p\tPrint all math function names and quit\n");
+    vlog("\t\t-l\tlink check only (make sure functions are present, skip "
+         "accuracy checks.)\n");
+    vlog("\t\t-m\tToggle run multi-threaded. (Default: on) )\n");
+    vlog("\t\t-s\tStop on error\n");
+    vlog("\t\t-t\tToggle timing  (on by default)\n");
+    vlog("\t\t-w\tToggle Wimpy Mode, * Not a valid test * \n");
+    vlog("\t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is "
+         "1-10, default factor(%u)\n",
+         gWimpyReductionFactor);
+    vlog("\t\t-z\tToggle FTZ mode (Section 6.5.3) for all functions. (Set by "
+         "device capabilities by default.)\n");
+    vlog("\t\t-v\tToggle Verbosity (Default: off)\n ");
+    vlog("\t\t-#\tTest only vector sizes #, e.g. \"-1\" tests scalar only, "
+         "\"-16\" tests 16-wide vectors only.\n");
+    vlog("\n\tYou may also pass a number instead of a function name.\n");
+    vlog("\tThis causes the first N tests to be skipped. The tests are "
+         "numbered.\n");
+    vlog("\tIf you pass a second number, that is the number tests to run after "
+         "the first one.\n");
+    vlog("\tA name list may be used in conjunction with a number range. In "
+         "that case,\n");
+    vlog("\tonly the named cases in the number range will run.\n");
+    vlog("\tYou may also choose to pass no arguments, in which case all tests "
+         "will be run.\n");
+    vlog("\tYou may pass CL_DEVICE_TYPE_CPU/GPU/ACCELERATOR to select the "
+         "device.\n");
+    vlog("\n");
 }
 
-static void CL_CALLBACK bruteforce_notify_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data)
+static void CL_CALLBACK bruteforce_notify_callback(const char *errinfo,
+                                                   const void *private_info,
+                                                   size_t cb, void *user_data)
 {
-    vlog( "%s  (%p, %zd, %p)\n", errinfo, private_info, cb, user_data );
+    vlog("%s  (%p, %zd, %p)\n", errinfo, private_info, cb, user_data);
 }
 
-test_status InitCL( cl_device_id device )
+test_status InitCL(cl_device_id device)
 {
     int error;
     uint32_t i;
-    size_t configSize = sizeof( gComputeDevices );
+    size_t configSize = sizeof(gComputeDevices);
     cl_device_type device_type;
 
-    error = clGetDeviceInfo( device, CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL );
-    if( error )
+    error = clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(device_type),
+                            &device_type, NULL);
+    if (error)
     {
-        print_error( error, "Unable to get device type" );
+        print_error(error, "Unable to get device type");
         return TEST_FAIL;
     }
 
     gDevice = device;
-    if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_MAX_COMPUTE_UNITS, configSize, &gComputeDevices, NULL )) )
+    if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_MAX_COMPUTE_UNITS,
+                                 configSize, &gComputeDevices, NULL)))
         gComputeDevices = 1;
 
     // Check extensions
-    if(is_extension_available(gDevice, "cl_khr_fp64"))
+    if (is_extension_available(gDevice, "cl_khr_fp64"))
     {
         gHasDouble ^= 1;
-#if defined( CL_DEVICE_DOUBLE_FP_CONFIG )
-        if( (error = clGetDeviceInfo(gDevice, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(gDoubleCapabilities), &gDoubleCapabilities, NULL)))
+#if defined(CL_DEVICE_DOUBLE_FP_CONFIG)
+        if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_DOUBLE_FP_CONFIG,
+                                     sizeof(gDoubleCapabilities),
+                                     &gDoubleCapabilities, NULL)))
         {
-            vlog_error( "ERROR: Unable to get device CL_DEVICE_DOUBLE_FP_CONFIG. (%d)\n", error );
+            vlog_error("ERROR: Unable to get device "
+                       "CL_DEVICE_DOUBLE_FP_CONFIG. (%d)\n",
+                       error);
             return TEST_FAIL;
         }
 
-        if( DOUBLE_REQUIRED_FEATURES != (gDoubleCapabilities & DOUBLE_REQUIRED_FEATURES) )
+        if (DOUBLE_REQUIRED_FEATURES
+            != (gDoubleCapabilities & DOUBLE_REQUIRED_FEATURES))
         {
             std::string list;
             if (0 == (gDoubleCapabilities & CL_FP_FMA)) list += "CL_FP_FMA, ";
-            if( 0 == (gDoubleCapabilities & CL_FP_ROUND_TO_NEAREST) )
+            if (0 == (gDoubleCapabilities & CL_FP_ROUND_TO_NEAREST))
                 list += "CL_FP_ROUND_TO_NEAREST, ";
-            if( 0 == (gDoubleCapabilities & CL_FP_ROUND_TO_ZERO) )
+            if (0 == (gDoubleCapabilities & CL_FP_ROUND_TO_ZERO))
                 list += "CL_FP_ROUND_TO_ZERO, ";
-            if( 0 == (gDoubleCapabilities & CL_FP_ROUND_TO_INF) )
+            if (0 == (gDoubleCapabilities & CL_FP_ROUND_TO_INF))
                 list += "CL_FP_ROUND_TO_INF, ";
-            if( 0 == (gDoubleCapabilities & CL_FP_INF_NAN) )
+            if (0 == (gDoubleCapabilities & CL_FP_INF_NAN))
                 list += "CL_FP_INF_NAN, ";
-            if( 0 == (gDoubleCapabilities & CL_FP_DENORM) )
+            if (0 == (gDoubleCapabilities & CL_FP_DENORM))
                 list += "CL_FP_DENORM, ";
             vlog_error("ERROR: required double features are missing: %s\n",
                        list.c_str());
@@ -1140,100 +1177,104 @@ test_status InitCL( cl_device_id device )
             return TEST_FAIL;
         }
 #else
-        vlog_error( "FAIL: device says it supports cl_khr_fp64 but CL_DEVICE_DOUBLE_FP_CONFIG is not in the headers!\n" );
+        vlog_error("FAIL: device says it supports cl_khr_fp64 but "
+                   "CL_DEVICE_DOUBLE_FP_CONFIG is not in the headers!\n");
         return TEST_FAIL;
 #endif
     }
 
-    configSize = sizeof( gDeviceFrequency );
-    if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, configSize, &gDeviceFrequency, NULL )) )
+    configSize = sizeof(gDeviceFrequency);
+    if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY,
+                                 configSize, &gDeviceFrequency, NULL)))
         gDeviceFrequency = 0;
 
-    if( (error = clGetDeviceInfo(gDevice, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(gFloatCapabilities), &gFloatCapabilities, NULL)))
+    if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_SINGLE_FP_CONFIG,
+                                 sizeof(gFloatCapabilities),
+                                 &gFloatCapabilities, NULL)))
     {
-        vlog_error( "ERROR: Unable to get device CL_DEVICE_SINGLE_FP_CONFIG. (%d)\n", error );
+        vlog_error(
+            "ERROR: Unable to get device CL_DEVICE_SINGLE_FP_CONFIG. (%d)\n",
+            error);
         return TEST_FAIL;
     }
 
-    gContext = clCreateContext( NULL, 1, &gDevice, bruteforce_notify_callback, NULL, &error );
-    if( NULL == gContext || error )
+    gContext = clCreateContext(NULL, 1, &gDevice, bruteforce_notify_callback,
+                               NULL, &error);
+    if (NULL == gContext || error)
     {
-        vlog_error( "clCreateContext failed. (%d) \n", error );
+        vlog_error("clCreateContext failed. (%d) \n", error);
         return TEST_FAIL;
     }
 
     gQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-    if( NULL == gQueue || error )
+    if (NULL == gQueue || error)
     {
-        vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+        vlog_error("clCreateCommandQueue failed. (%d)\n", error);
         return TEST_FAIL;
     }
 
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
     // FIXME: use clProtectedArray
 #endif
-    //Allocate buffers
+    // Allocate buffers
     cl_uint min_alignment = 0;
-    error = clGetDeviceInfo (gDevice, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), (void*)&min_alignment, NULL);
+    error = clGetDeviceInfo(gDevice, CL_DEVICE_MEM_BASE_ADDR_ALIGN,
+                            sizeof(cl_uint), (void *)&min_alignment, NULL);
     if (CL_SUCCESS != error)
     {
-        vlog_error( "clGetDeviceInfo failed. (%d)\n", error );
+        vlog_error("clGetDeviceInfo failed. (%d)\n", error);
         return TEST_FAIL;
     }
-    min_alignment >>= 3;    // convert bits to bytes
-
-    gIn   = align_malloc( BUFFER_SIZE, min_alignment );
-    if( NULL == gIn )
-        return TEST_FAIL;
-    gIn2   = align_malloc( BUFFER_SIZE, min_alignment );
-    if( NULL == gIn2 )
-        return TEST_FAIL;
-    gIn3   = align_malloc( BUFFER_SIZE, min_alignment );
-    if( NULL == gIn3 )
-        return TEST_FAIL;
-    gOut_Ref   = align_malloc( BUFFER_SIZE, min_alignment );
-    if( NULL == gOut_Ref )
-        return TEST_FAIL;
-    gOut_Ref2   = align_malloc( BUFFER_SIZE, min_alignment );
-    if( NULL == gOut_Ref2 )
-        return TEST_FAIL;
-
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    min_alignment >>= 3; // convert bits to bytes
+
+    gIn = align_malloc(BUFFER_SIZE, min_alignment);
+    if (NULL == gIn) return TEST_FAIL;
+    gIn2 = align_malloc(BUFFER_SIZE, min_alignment);
+    if (NULL == gIn2) return TEST_FAIL;
+    gIn3 = align_malloc(BUFFER_SIZE, min_alignment);
+    if (NULL == gIn3) return TEST_FAIL;
+    gOut_Ref = align_malloc(BUFFER_SIZE, min_alignment);
+    if (NULL == gOut_Ref) return TEST_FAIL;
+    gOut_Ref2 = align_malloc(BUFFER_SIZE, min_alignment);
+    if (NULL == gOut_Ref2) return TEST_FAIL;
+
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        gOut[i] = align_malloc( BUFFER_SIZE, min_alignment );
-        if( NULL == gOut[i] )
-            return TEST_FAIL;
-        gOut2[i] = align_malloc( BUFFER_SIZE, min_alignment );
-        if( NULL == gOut2[i] )
-            return TEST_FAIL;
+        gOut[i] = align_malloc(BUFFER_SIZE, min_alignment);
+        if (NULL == gOut[i]) return TEST_FAIL;
+        gOut2[i] = align_malloc(BUFFER_SIZE, min_alignment);
+        if (NULL == gOut2[i]) return TEST_FAIL;
     }
 
     cl_mem_flags device_flags = CL_MEM_READ_ONLY;
     // save a copy on the host device to make this go faster
-    if( CL_DEVICE_TYPE_CPU == device_type )
+    if (CL_DEVICE_TYPE_CPU == device_type)
         device_flags |= CL_MEM_USE_HOST_PTR;
-      else
-          device_flags |= CL_MEM_COPY_HOST_PTR;
+    else
+        device_flags |= CL_MEM_COPY_HOST_PTR;
 
     // setup input buffers
-    gInBuffer = clCreateBuffer(gContext, device_flags, BUFFER_SIZE, gIn, &error);
-    if( gInBuffer == NULL || error )
+    gInBuffer =
+        clCreateBuffer(gContext, device_flags, BUFFER_SIZE, gIn, &error);
+    if (gInBuffer == NULL || error)
     {
-        vlog_error( "clCreateBuffer1 failed for input (%d)\n", error );
+        vlog_error("clCreateBuffer1 failed for input (%d)\n", error);
         return TEST_FAIL;
     }
 
-    gInBuffer2 = clCreateBuffer( gContext, device_flags, BUFFER_SIZE, gIn2, &error );
-    if( gInBuffer2 == NULL || error )
+    gInBuffer2 =
+        clCreateBuffer(gContext, device_flags, BUFFER_SIZE, gIn2, &error);
+    if (gInBuffer2 == NULL || error)
     {
-        vlog_error( "clCreateArray2 failed for input (%d)\n" , error );
+        vlog_error("clCreateArray2 failed for input (%d)\n", error);
         return TEST_FAIL;
     }
 
-    gInBuffer3 = clCreateBuffer( gContext, device_flags, BUFFER_SIZE, gIn3, &error );
-    if( gInBuffer3 == NULL  || error)
+    gInBuffer3 =
+        clCreateBuffer(gContext, device_flags, BUFFER_SIZE, gIn3, &error);
+    if (gInBuffer3 == NULL || error)
     {
-        vlog_error( "clCreateArray3 failed for input (%d)\n", error );
+        vlog_error("clCreateArray3 failed for input (%d)\n", error);
         return TEST_FAIL;
     }
 
@@ -1241,38 +1282,40 @@ test_status InitCL( cl_device_id device )
     // setup output buffers
     device_flags = CL_MEM_READ_WRITE;
     // save a copy on the host device to make this go faster
-    if( CL_DEVICE_TYPE_CPU == device_type )
+    if (CL_DEVICE_TYPE_CPU == device_type)
         device_flags |= CL_MEM_USE_HOST_PTR;
-      else
-          device_flags |= CL_MEM_COPY_HOST_PTR;
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    else
+        device_flags |= CL_MEM_COPY_HOST_PTR;
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        gOutBuffer[i] = clCreateBuffer( gContext, device_flags, BUFFER_SIZE, gOut[i], &error );
-        if( gOutBuffer[i] == NULL || error )
+        gOutBuffer[i] = clCreateBuffer(gContext, device_flags, BUFFER_SIZE,
+                                       gOut[i], &error);
+        if (gOutBuffer[i] == NULL || error)
         {
-            vlog_error( "clCreateArray failed for output (%d)\n", error  );
+            vlog_error("clCreateArray failed for output (%d)\n", error);
             return TEST_FAIL;
         }
-        gOutBuffer2[i] = clCreateBuffer( gContext, device_flags, BUFFER_SIZE, gOut2[i], &error );
-        if( gOutBuffer2[i] == NULL || error)
+        gOutBuffer2[i] = clCreateBuffer(gContext, device_flags, BUFFER_SIZE,
+                                        gOut2[i], &error);
+        if (gOutBuffer2[i] == NULL || error)
         {
-            vlog_error( "clCreateArray2 failed for output (%d)\n", error );
+            vlog_error("clCreateArray2 failed for output (%d)\n", error);
             return TEST_FAIL;
         }
     }
 
     // we are embedded, check current rounding mode
-    if( gIsEmbedded )
+    if (gIsEmbedded)
     {
         gIsInRTZMode = IsInRTZMode();
     }
 
-    //Check tininess detection
+    // Check tininess detection
     IsTininessDetectedBeforeRounding();
 
     cl_platform_id platform;
     int err = clGetPlatformIDs(1, &platform, NULL);
-    if( err )
+    if (err)
     {
         print_error(err, "clGetPlatformIDs failed");
         return TEST_FAIL;
@@ -1280,78 +1323,97 @@ test_status InitCL( cl_device_id device )
 
     char c[1024];
     static const char *no_yes[] = { "NO", "YES" };
-    vlog( "\nCompute Device info:\n" );
+    vlog("\nCompute Device info:\n");
     clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(c), &c, NULL);
-    vlog( "\tPlatform Version: %s\n", c );
+    vlog("\tPlatform Version: %s\n", c);
     clGetDeviceInfo(gDevice, CL_DEVICE_NAME, sizeof(c), &c, NULL);
-    vlog( "\tDevice Name: %s\n", c );
+    vlog("\tDevice Name: %s\n", c);
     clGetDeviceInfo(gDevice, CL_DEVICE_VENDOR, sizeof(c), &c, NULL);
-    vlog( "\tVendor: %s\n", c );
+    vlog("\tVendor: %s\n", c);
     clGetDeviceInfo(gDevice, CL_DEVICE_VERSION, sizeof(c), &c, NULL);
-    vlog( "\tDevice Version: %s\n", c );
+    vlog("\tDevice Version: %s\n", c);
     clGetDeviceInfo(gDevice, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL);
-    vlog( "\tCL C Version: %s\n", c );
+    vlog("\tCL C Version: %s\n", c);
     clGetDeviceInfo(gDevice, CL_DRIVER_VERSION, sizeof(c), &c, NULL);
-    vlog( "\tDriver Version: %s\n", c );
-    vlog( "\tDevice Frequency: %d MHz\n", gDeviceFrequency );
-    vlog( "\tSubnormal values supported for floats? %s\n", no_yes[0 != (CL_FP_DENORM & gFloatCapabilities)] );
-    vlog( "\tCorrectly rounded divide and sqrt supported for floats? %s\n", no_yes[0 != (CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT & gFloatCapabilities)] );
-    if( gToggleCorrectlyRoundedDivideSqrt )
+    vlog("\tDriver Version: %s\n", c);
+    vlog("\tDevice Frequency: %d MHz\n", gDeviceFrequency);
+    vlog("\tSubnormal values supported for floats? %s\n",
+         no_yes[0 != (CL_FP_DENORM & gFloatCapabilities)]);
+    vlog("\tCorrectly rounded divide and sqrt supported for floats? %s\n",
+         no_yes[0
+                != (CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT & gFloatCapabilities)]);
+    if (gToggleCorrectlyRoundedDivideSqrt)
     {
         gFloatCapabilities ^= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
     }
-    vlog( "\tTesting with correctly rounded float divide and sqrt? %s\n", no_yes[0 != (CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT & gFloatCapabilities)] );
-    vlog( "\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities)] );
-    vlog( "\tTesting single precision? %s\n", no_yes[0 != gTestFloat] );
-    vlog( "\tTesting fast relaxed math? %s\n", no_yes[0 != gTestFastRelaxed] );
-    if(gTestFastRelaxed)
+    vlog("\tTesting with correctly rounded float divide and sqrt? %s\n",
+         no_yes[0
+                != (CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT & gFloatCapabilities)]);
+    vlog("\tTesting with FTZ mode ON for floats? %s\n",
+         no_yes[0 != gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities)]);
+    vlog("\tTesting single precision? %s\n", no_yes[0 != gTestFloat]);
+    vlog("\tTesting fast relaxed math? %s\n", no_yes[0 != gTestFastRelaxed]);
+    if (gTestFastRelaxed)
     {
-      vlog( "\tFast relaxed math has derived implementations? %s\n", no_yes[0 != gFastRelaxedDerived] );
+        vlog("\tFast relaxed math has derived implementations? %s\n",
+             no_yes[0 != gFastRelaxedDerived]);
     }
-    vlog( "\tTesting double precision? %s\n", no_yes[0 != gHasDouble] );
-    if( sizeof( long double) == sizeof( double ) && gHasDouble )
+    vlog("\tTesting double precision? %s\n", no_yes[0 != gHasDouble]);
+    if (sizeof(long double) == sizeof(double) && gHasDouble)
     {
-        vlog( "\n\t\tWARNING: Host system long double does not have better precision than double!\n" );
-        vlog( "\t\t         All double results that do not match the reference result have their reported\n" );
-        vlog( "\t\t         error inflated by 0.5 ulps to account for the fact that this system\n" );
-        vlog( "\t\t         can not accurately represent the right result to an accuracy closer\n" );
-        vlog( "\t\t         than half an ulp. See comments in Bruteforce_Ulp_Error_Double() for more details.\n\n" );
+        vlog("\n\t\tWARNING: Host system long double does not have better "
+             "precision than double!\n");
+        vlog("\t\t         All double results that do not match the reference "
+             "result have their reported\n");
+        vlog("\t\t         error inflated by 0.5 ulps to account for the fact "
+             "that this system\n");
+        vlog("\t\t         can not accurately represent the right result to an "
+             "accuracy closer\n");
+        vlog("\t\t         than half an ulp. See comments in "
+             "Bruteforce_Ulp_Error_Double() for more details.\n\n");
     }
 
-    vlog( "\tIs Embedded? %s\n", no_yes[0 != gIsEmbedded] );
-    if( gIsEmbedded )
-        vlog( "\tRunning in RTZ mode? %s\n", no_yes[0 != gIsInRTZMode] );
-    vlog( "\tTininess is detected before rounding? %s\n", no_yes[0 != gCheckTininessBeforeRounding] );
-    vlog( "\tWorker threads: %d\n", GetThreadCount() );
-    vlog( "\tTesting vector sizes:" );
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        vlog( "\t%d", sizeValues[i] );
+    vlog("\tIs Embedded? %s\n", no_yes[0 != gIsEmbedded]);
+    if (gIsEmbedded)
+        vlog("\tRunning in RTZ mode? %s\n", no_yes[0 != gIsInRTZMode]);
+    vlog("\tTininess is detected before rounding? %s\n",
+         no_yes[0 != gCheckTininessBeforeRounding]);
+    vlog("\tWorker threads: %d\n", GetThreadCount());
+    vlog("\tTesting vector sizes:");
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+        vlog("\t%d", sizeValues[i]);
 
     vlog("\n");
     vlog("\tVerbose? %s\n", no_yes[0 != gVerboseBruteForce]);
-    vlog( "\n\n" );
+    vlog("\n\n");
 
-    // Check to see if we are using single threaded mode on other than a 1.0 device
-    if (getenv( "CL_TEST_SINGLE_THREADED" )) {
+    // Check to see if we are using single threaded mode on other than a 1.0
+    // device
+    if (getenv("CL_TEST_SINGLE_THREADED"))
+    {
 
-      char device_version[1024] = { 0 };
-      clGetDeviceInfo( gDevice, CL_DEVICE_VERSION, sizeof(device_version), device_version, NULL );
+        char device_version[1024] = { 0 };
+        clGetDeviceInfo(gDevice, CL_DEVICE_VERSION, sizeof(device_version),
+                        device_version, NULL);
 
-      if (strcmp("OpenCL 1.0 ",device_version)) {
-        vlog("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. Running single threaded.\n");
-      }
+        if (strcmp("OpenCL 1.0 ", device_version))
+        {
+            vlog("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. "
+                 "Running single threaded.\n");
+        }
     }
 
     return TEST_PASS;
 }
 
-static void ReleaseCL( void )
+static void ReleaseCL(void)
 {
     uint32_t i;
     clReleaseMemObject(gInBuffer);
     clReleaseMemObject(gInBuffer2);
     clReleaseMemObject(gInBuffer3);
-    for ( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) {
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
         clReleaseMemObject(gOutBuffer[i]);
         clReleaseMemObject(gOutBuffer2[i]);
     }
@@ -1364,25 +1426,27 @@ static void ReleaseCL( void )
     align_free(gOut_Ref);
     align_free(gOut_Ref2);
 
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         align_free(gOut[i]);
         align_free(gOut2[i]);
     }
 }
 
-void _LogBuildError( cl_program p, int line, const char *file )
+void _LogBuildError(cl_program p, int line, const char *file)
 {
     char the_log[2048] = "";
 
-    vlog_error( "%s:%d: Build Log:\n", file, line );
-    if( 0 == clGetProgramBuildInfo(p, gDevice, CL_PROGRAM_BUILD_LOG, sizeof(the_log), the_log, NULL) )
-        vlog_error( "%s", the_log );
+    vlog_error("%s:%d: Build Log:\n", file, line);
+    if (0
+        == clGetProgramBuildInfo(p, gDevice, CL_PROGRAM_BUILD_LOG,
+                                 sizeof(the_log), the_log, NULL))
+        vlog_error("%s", the_log);
     else
-        vlog_error( "*** Error getting build log for program %p\n", p );
+        vlog_error("*** Error getting build log for program %p\n", p);
 }
 
-int InitILogbConstants( void )
+int InitILogbConstants(void)
 {
     int error;
     const char *kernelSource =
@@ -1408,7 +1472,9 @@ int InitILogbConstants( void )
              clSetKernelArg(kernel, 0, sizeof(gOutBuffer[gMinVectorSizeIndex]),
                             &gOutBuffer[gMinVectorSizeIndex])))
     {
-        vlog_error( "Error: Unable to set kernel arg to get FP_ILOGB0 and FP_ILOGBNAN for the device. Err = %d", error );
+        vlog_error("Error: Unable to set kernel arg to get FP_ILOGB0 and "
+                   "FP_ILOGBNAN for the device. Err = %d",
+                   error);
         return error;
     }
 
@@ -1416,14 +1482,23 @@ int InitILogbConstants( void )
     if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &dim, NULL, 0,
                                         NULL, NULL)))
     {
-        vlog_error( "Error: Unable to execute kernel to get FP_ILOGB0 and FP_ILOGBNAN for the device. Err = %d", error );
+        vlog_error("Error: Unable to execute kernel to get FP_ILOGB0 and "
+                   "FP_ILOGBNAN for the device. Err = %d",
+                   error);
         return error;
     }
 
-    struct{ cl_int ilogb0, ilogbnan; }data;
-    if(( error = clEnqueueReadBuffer( gQueue, gOutBuffer[gMinVectorSizeIndex], CL_TRUE, 0, sizeof( data ), &data, 0, NULL, NULL)))
+    struct
+    {
+        cl_int ilogb0, ilogbnan;
+    } data;
+    if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[gMinVectorSizeIndex],
+                                     CL_TRUE, 0, sizeof(data), &data, 0, NULL,
+                                     NULL)))
     {
-        vlog_error( "Error: unable to read FP_ILOGB0 and FP_ILOGBNAN from the device. Err = %d", error );
+        vlog_error("Error: unable to read FP_ILOGB0 and FP_ILOGBNAN from the "
+                   "device. Err = %d",
+                   error);
         return error;
     }
 
@@ -1433,7 +1508,7 @@ int InitILogbConstants( void )
     return 0;
 }
 
-int IsTininessDetectedBeforeRounding( void )
+int IsTininessDetectedBeforeRounding(void)
 {
     int error;
     const char *kernelSource =
@@ -1449,7 +1524,8 @@ int IsTininessDetectedBeforeRounding( void )
     error =
         create_single_kernel_helper(gContext, &query, &kernel, 1, &kernelSource,
                                     "IsTininessDetectedBeforeRounding");
-    if (error != CL_SUCCESS) {
+    if (error != CL_SUCCESS)
+    {
         vlog_error("Error: Unable to create kernel to detect how tininess is "
                    "detected for the device. (%d)",
                    error);
@@ -1460,7 +1536,9 @@ int IsTininessDetectedBeforeRounding( void )
              clSetKernelArg(kernel, 0, sizeof(gOutBuffer[gMinVectorSizeIndex]),
                             &gOutBuffer[gMinVectorSizeIndex])))
     {
-        vlog_error( "Error: Unable to set kernel arg to detect how tininess is detected  for the device. Err = %d", error );
+        vlog_error("Error: Unable to set kernel arg to detect how tininess is "
+                   "detected  for the device. Err = %d",
+                   error);
         return error;
     }
 
@@ -1468,14 +1546,23 @@ int IsTininessDetectedBeforeRounding( void )
     if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &dim, NULL, 0,
                                         NULL, NULL)))
     {
-        vlog_error( "Error: Unable to execute kernel to detect how tininess is detected  for the device. Err = %d", error );
+        vlog_error("Error: Unable to execute kernel to detect how tininess is "
+                   "detected  for the device. Err = %d",
+                   error);
         return error;
     }
 
-    struct{ cl_uint f; }data;
-    if(( error = clEnqueueReadBuffer( gQueue, gOutBuffer[gMinVectorSizeIndex], CL_TRUE, 0, sizeof( data ), &data, 0, NULL, NULL)))
+    struct
+    {
+        cl_uint f;
+    } data;
+    if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[gMinVectorSizeIndex],
+                                     CL_TRUE, 0, sizeof(data), &data, 0, NULL,
+                                     NULL)))
     {
-        vlog_error( "Error: unable to read result from tininess test from the device. Err = %d", error );
+        vlog_error("Error: unable to read result from tininess test from the "
+                   "device. Err = %d",
+                   error);
         return error;
     }
 
@@ -1491,14 +1578,14 @@ int MakeKernel(const char **c, cl_uint count, const char *name, cl_kernel *k,
     int error = 0;
     char options[200] = "";
 
-    if( gForceFTZ )
+    if (gForceFTZ)
     {
-      strcat(options," -cl-denorms-are-zero");
+        strcat(options, " -cl-denorms-are-zero");
     }
 
     if (relaxedMode)
     {
-      strcat(options, " -cl-fast-relaxed-math");
+        strcat(options, " -cl-fast-relaxed-math");
     }
 
     error =
@@ -1522,39 +1609,41 @@ int MakeKernels(const char **c, cl_uint count, const char *name,
 
     if (gForceFTZ)
     {
-      strcat(options," -cl-denorms-are-zero ");
+        strcat(options, " -cl-denorms-are-zero ");
     }
 
-    if( gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT )
+    if (gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT)
     {
-      strcat(options," -cl-fp32-correctly-rounded-divide-sqrt ");
+        strcat(options, " -cl-fp32-correctly-rounded-divide-sqrt ");
     }
 
     if (relaxedMode)
     {
-      strcat(options, " -cl-fast-relaxed-math");
+        strcat(options, " -cl-fast-relaxed-math");
     }
 
-    error = create_single_kernel_helper(gContext, p, NULL, count, c, NULL, options);
-    if ( error != CL_SUCCESS )
+    error =
+        create_single_kernel_helper(gContext, p, NULL, count, c, NULL, options);
+    if (error != CL_SUCCESS)
     {
-        vlog_error( "\t\tFAILED -- Failed to create program. (%d)\n", error );
+        vlog_error("\t\tFAILED -- Failed to create program. (%d)\n", error);
         return error;
     }
 
 
-    memset( k, 0, kernel_count * sizeof( *k) );
-    for( i = 0; i< kernel_count; i++ )
+    memset(k, 0, kernel_count * sizeof(*k));
+    for (i = 0; i < kernel_count; i++)
     {
-        k[i] = clCreateKernel( *p, name, &error );
-        if( NULL == k[i]|| error )
+        k[i] = clCreateKernel(*p, name, &error);
+        if (NULL == k[i] || error)
         {
-            char    buffer[2048] = "";
+            char buffer[2048] = "";
 
             vlog_error("\t\tFAILED -- clCreateKernel() failed: (%d)\n", error);
-            clGetProgramBuildInfo(*p, gDevice, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
+            clGetProgramBuildInfo(*p, gDevice, CL_PROGRAM_BUILD_LOG,
+                                  sizeof(buffer), buffer, NULL);
             vlog_error("Log: %s\n", buffer);
-            clReleaseProgram( *p );
+            clReleaseProgram(*p);
             return error;
         }
     }
@@ -1563,7 +1652,7 @@ int MakeKernels(const char **c, cl_uint count, const char *name,
 }
 
 
-static int IsInRTZMode( void )
+static int IsInRTZMode(void)
 {
     int error;
     const char *kernelSource =
@@ -1578,7 +1667,8 @@ static int IsInRTZMode( void )
     clKernelWrapper kernel;
     error = create_single_kernel_helper(gContext, &query, &kernel, 1,
                                         &kernelSource, "GetRoundingMode");
-    if (error != CL_SUCCESS) {
+    if (error != CL_SUCCESS)
+    {
         vlog_error("Error: Unable to create kernel to detect RTZ mode for the "
                    "device. (%d)",
                    error);
@@ -1589,7 +1679,9 @@ static int IsInRTZMode( void )
              clSetKernelArg(kernel, 0, sizeof(gOutBuffer[gMinVectorSizeIndex]),
                             &gOutBuffer[gMinVectorSizeIndex])))
     {
-        vlog_error( "Error: Unable to set kernel arg to detect RTZ mode for the device. Err = %d", error );
+        vlog_error("Error: Unable to set kernel arg to detect RTZ mode for the "
+                   "device. Err = %d",
+                   error);
         return error;
     }
 
@@ -1597,14 +1689,23 @@ static int IsInRTZMode( void )
     if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &dim, NULL, 0,
                                         NULL, NULL)))
     {
-        vlog_error( "Error: Unable to execute kernel to detect RTZ mode for the device. Err = %d", error );
+        vlog_error("Error: Unable to execute kernel to detect RTZ mode for the "
+                   "device. Err = %d",
+                   error);
         return error;
     }
 
-    struct{ cl_int isRTZ; }data;
-    if(( error = clEnqueueReadBuffer( gQueue, gOutBuffer[gMinVectorSizeIndex], CL_TRUE, 0, sizeof( data ), &data, 0, NULL, NULL)))
+    struct
     {
-        vlog_error( "Error: unable to read RTZ mode data from the device. Err = %d", error );
+        cl_int isRTZ;
+    } data;
+    if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[gMinVectorSizeIndex],
+                                     CL_TRUE, 0, sizeof(data), &data, 0, NULL,
+                                     NULL)))
+    {
+        vlog_error(
+            "Error: unable to read RTZ mode data from the device. Err = %d",
+            error);
         return error;
     }
 
@@ -1613,46 +1714,54 @@ static int IsInRTZMode( void )
 
 #pragma mark -
 
-const char *sizeNames[ VECTOR_SIZE_COUNT] = { "", "2", "3", "4", "8", "16" };
-const int  sizeValues[ VECTOR_SIZE_COUNT] = { 1, 2, 3, 4, 8, 16 };
-
-// TODO: There is another version of Ulp_Error_Double defined in test_common/harness/errorHelpers.c
-float Bruteforce_Ulp_Error_Double( double test, long double reference )
-{
-//Check for Non-power-of-two and NaN
-
-  // Note: This function presumes that someone has already tested whether the result is correctly,
-  // rounded before calling this function.  That test:
-  //
-  //    if( (float) reference == test )
-  //        return 0.0f;
-  //
-  // would ensure that cases like fabs(reference) > FLT_MAX are weeded out before we get here.
-  // Otherwise, we'll return inf ulp error here, for what are otherwise correctly rounded
-  // results.
-
-  // Deal with long double = double
-  // On most systems long double is a higher precision type than double. They provide either
-  // a 80-bit or greater floating point type, or they provide a head-tail double double format.
-  // That is sufficient to represent the accuracy of a floating point result to many more bits
-  // than double and we can calculate sub-ulp errors. This is the standard system for which this
-  // test suite is designed.
-  //
-  // On some systems double and long double are the same thing. Then we run into a problem,
-  // because our representation of the infinitely precise result (passed in as reference above)
-  // can be off by as much as a half double precision ulp itself.  In this case, we inflate the
-  // reported error by half an ulp to take this into account.  A more correct and permanent fix
-  // would be to undertake refactoring the reference code to return results in this format:
-  //
-  //    typedef struct DoubleReference
-  //    { // true value = correctlyRoundedResult + ulps * ulp(correctlyRoundedResult)        (infinitely precise)
-  //        double  correctlyRoundedResult;     // as best we can
-  //        double  ulps;                       // plus a fractional amount to account for the difference
-  //    }DoubleReference;                       //     between infinitely precise result and correctlyRoundedResult, in units of ulps.
-  //
-  // This would provide a useful higher-than-double precision format for everyone that we can use,
-  // and would solve a few problems with representing absolute errors below DBL_MIN and over DBL_MAX for systems
-  // that use a head to tail double double for long double.
+const char *sizeNames[VECTOR_SIZE_COUNT] = { "", "2", "3", "4", "8", "16" };
+const int sizeValues[VECTOR_SIZE_COUNT] = { 1, 2, 3, 4, 8, 16 };
+
+// TODO: There is another version of Ulp_Error_Double defined in
+// test_common/harness/errorHelpers.c
+float Bruteforce_Ulp_Error_Double(double test, long double reference)
+{
+    // Check for Non-power-of-two and NaN
+
+    // Note: This function presumes that someone has already tested whether the
+    // result is correctly, rounded before calling this function.  That test:
+    //
+    //    if( (float) reference == test )
+    //        return 0.0f;
+    //
+    // would ensure that cases like fabs(reference) > FLT_MAX are weeded out
+    // before we get here. Otherwise, we'll return inf ulp error here, for what
+    // are otherwise correctly rounded results.
+
+    // Deal with long double = double
+    // On most systems long double is a higher precision type than double. They
+    // provide either a 80-bit or greater floating point type, or they provide a
+    // head-tail double double format. That is sufficient to represent the
+    // accuracy of a floating point result to many more bits than double and we
+    // can calculate sub-ulp errors. This is the standard system for which this
+    // test suite is designed.
+    //
+    // On some systems double and long double are the same thing. Then we run
+    // into a problem, because our representation of the infinitely precise
+    // result (passed in as reference above) can be off by as much as a half
+    // double precision ulp itself.  In this case, we inflate the reported error
+    // by half an ulp to take this into account.  A more correct and permanent
+    // fix would be to undertake refactoring the reference code to return
+    // results in this format:
+    //
+    //    typedef struct DoubleReference
+    //    { // true value = correctlyRoundedResult + ulps *
+    //    ulp(correctlyRoundedResult)        (infinitely precise)
+    //        double  correctlyRoundedResult;     // as best we can
+    //        double  ulps;                       // plus a fractional amount to
+    //        account for the difference
+    //    }DoubleReference;                       //     between infinitely
+    //    precise result and correctlyRoundedResult, in units of ulps.
+    //
+    // This would provide a useful higher-than-double precision format for
+    // everyone that we can use, and would solve a few problems with
+    // representing absolute errors below DBL_MIN and over DBL_MAX for systems
+    // that use a head to tail double double for long double.
 
     int x;
     long double testVal = test;
@@ -1660,119 +1769,118 @@ float Bruteforce_Ulp_Error_Double( double test, long double reference )
     // First, handle special reference values
     if (isinf(reference))
     {
-    if (reference == testVal)
-        return 0.0f;
+        if (reference == testVal) return 0.0f;
 
-    return INFINITY;
+        return INFINITY;
     }
 
     if (isnan(reference))
     {
-    if (isnan(testVal))
-        return 0.0f;
+        if (isnan(testVal)) return 0.0f;
 
-    return INFINITY;
+        return INFINITY;
     }
 
-    if ( 0.0L != reference && 0.5L != frexpl(reference, &x) )
+    if (0.0L != reference && 0.5L != frexpl(reference, &x))
     { // Non-zero and Non-power of two
 
-       // allow correctly rounded results to pass through unmolested. (We might add error to it below.)
-       // There is something of a performance optimization here.
-        if( testVal == reference )
-            return 0.0f;
+        // allow correctly rounded results to pass through unmolested. (We might
+        // add error to it below.) There is something of a performance
+        // optimization here.
+        if (testVal == reference) return 0.0f;
 
         // The unbiased exponent of the ulp unit place
-        int ulp_exp = DBL_MANT_DIG - 1 - MAX( ilogbl( reference), DBL_MIN_EXP-1 );
+        int ulp_exp =
+            DBL_MANT_DIG - 1 - MAX(ilogbl(reference), DBL_MIN_EXP - 1);
 
         // Scale the exponent of the error
-        float result = (float) scalbnl( testVal - reference, ulp_exp );
+        float result = (float)scalbnl(testVal - reference, ulp_exp);
 
-        // account for rounding error in reference result on systems that do not have a higher precision floating point type (see above)
-        if( sizeof(long double) == sizeof( double ) )
-            result += copysignf( 0.5f, result);
+        // account for rounding error in reference result on systems that do not
+        // have a higher precision floating point type (see above)
+        if (sizeof(long double) == sizeof(double))
+            result += copysignf(0.5f, result);
 
         return result;
     }
 
     // reference is a normal power of two or a zero
     // The unbiased exponent of the ulp unit place
-    int ulp_exp =  DBL_MANT_DIG - 1 - MAX( ilogbl( reference) - 1, DBL_MIN_EXP-1 );
+    int ulp_exp =
+        DBL_MANT_DIG - 1 - MAX(ilogbl(reference) - 1, DBL_MIN_EXP - 1);
 
-   // allow correctly rounded results to pass through unmolested. (We might add error to it below.)
-   // There is something of a performance optimization here too.
-    if( testVal == reference )
-        return 0.0f;
+    // allow correctly rounded results to pass through unmolested. (We might add
+    // error to it below.) There is something of a performance optimization here
+    // too.
+    if (testVal == reference) return 0.0f;
 
     // Scale the exponent of the error
-    float result = (float) scalbnl( testVal - reference, ulp_exp );
+    float result = (float)scalbnl(testVal - reference, ulp_exp);
 
-    // account for rounding error in reference result on systems that do not have a higher precision floating point type (see above)
-    if( sizeof(long double) == sizeof( double ) )
-        result += copysignf( 0.5f, result);
+    // account for rounding error in reference result on systems that do not
+    // have a higher precision floating point type (see above)
+    if (sizeof(long double) == sizeof(double))
+        result += copysignf(0.5f, result);
 
     return result;
 }
 
-float Abs_Error( float test, double reference )
+float Abs_Error(float test, double reference)
 {
-  if( isnan(test) && isnan(reference) )
-    return 0.0f;
-  return fabs((float)(reference-(double)test));
+    if (isnan(test) && isnan(reference)) return 0.0f;
+    return fabs((float)(reference - (double)test));
 }
 
-#if defined( __APPLE__ )
-    #include <mach/mach_time.h>
+#if defined(__APPLE__)
+#include <mach/mach_time.h>
 #endif
 
-uint64_t GetTime( void )
+uint64_t GetTime(void)
 {
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
     return mach_absolute_time();
 #elif defined(_WIN32) && defined(_MSC_VER)
-    return  ReadTime();
+    return ReadTime();
 #else
-    //mach_absolute_time is a high precision timer with precision < 1 microsecond.
-    #warning need accurate clock here.  Times are invalid.
+// mach_absolute_time is a high precision timer with precision < 1 microsecond.
+#warning need accurate clock here.  Times are invalid.
     return 0;
 #endif
 }
 
 
-#if defined(_WIN32) && defined (_MSC_VER)
+#if defined(_WIN32) && defined(_MSC_VER)
 /* function is defined in "compat.h" */
 #else
-double SubtractTime( uint64_t endTime, uint64_t startTime )
+double SubtractTime(uint64_t endTime, uint64_t startTime)
 {
     uint64_t diff = endTime - startTime;
     static double conversion = 0.0;
 
-    if( 0.0 == conversion )
+    if (0.0 == conversion)
     {
-#if defined( __APPLE__ )
-        mach_timebase_info_data_t info = {0,0};
-        kern_return_t   err = mach_timebase_info( &info );
-        if( 0 == err )
-            conversion = 1e-9 * (double) info.numer / (double) info.denom;
+#if defined(__APPLE__)
+        mach_timebase_info_data_t info = { 0, 0 };
+        kern_return_t err = mach_timebase_info(&info);
+        if (0 == err)
+            conversion = 1e-9 * (double)info.numer / (double)info.denom;
 #else
-    // This function consumes output from GetTime() above, and converts the time to secionds.
-    #warning need accurate ticks to seconds conversion factor here. Times are invalid.
+// This function consumes output from GetTime() above, and converts the time to
+// secionds.
+#warning need accurate ticks to seconds conversion factor here. Times are invalid.
 #endif
     }
 
     // strictly speaking we should also be subtracting out timer latency here
-    return conversion * (double) diff;
+    return conversion * (double)diff;
 }
 #endif
 
-cl_uint RoundUpToNextPowerOfTwo( cl_uint x )
+cl_uint RoundUpToNextPowerOfTwo(cl_uint x)
 {
-    if( 0 == (x & (x-1)))
-        return x;
+    if (0 == (x & (x - 1))) return x;
 
-    while( x & (x-1) )
-        x &= x-1;
+    while (x & (x - 1)) x &= x - 1;
 
-    return x+x;
+    return x + x;
 }
-
diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp
index 01c99c147d..1a5a66905e 100644
--- a/test_conformance/math_brute_force/reference_math.cpp
+++ b/test_conformance/math_brute_force/reference_math.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -23,46 +23,47 @@
 
 #include "Utility.h"
 
-#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
-    #include <xmmintrin.h>
+#if defined(__SSE__)                                                           \
+    || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
+#include <xmmintrin.h>
 #endif
-#if defined( __SSE2__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
-    #include <emmintrin.h>
+#if defined(__SSE2__)                                                          \
+    || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
+#include <emmintrin.h>
 #endif
 
 #ifndef M_PI_4
-    #define M_PI_4 (M_PI/4)
+#define M_PI_4 (M_PI / 4)
 #endif
 
-#define EVALUATE( x )       x
-#define CONCATENATE(x, y)  x ## EVALUATE(y)
+#define EVALUATE(x) x
+#define CONCATENATE(x, y) x##EVALUATE(y)
 
 #pragma STDC FP_CONTRACT OFF
 static void __log2_ep(double *hi, double *lo, double x);
 
-typedef union
-{
+typedef union {
     uint64_t i;
     double d;
-}uint64d_t;
+} uint64d_t;
 
 static const uint64d_t _CL_NAN = { 0x7ff8000000000000ULL };
 
 #define cl_make_nan() _CL_NAN.d
 
-static double reduce1( double x );
-static double reduce1( double x )
+static double reduce1(double x);
+static double reduce1(double x)
 {
-    if( fabs(x) >= HEX_DBL( +, 1, 0, +, 53 ) )
+    if (fabs(x) >= HEX_DBL(+, 1, 0, +, 53))
     {
-        if( fabs(x) == INFINITY )
-            return cl_make_nan();
+        if (fabs(x) == INFINITY) return cl_make_nan();
 
-        return 0.0; //we patch up the sign for sinPi and cosPi later, since they need different signs
+        return 0.0; // we patch up the sign for sinPi and cosPi later, since
+                    // they need different signs
     }
 
     // Find the nearest multiple of 2
-    const double r = copysign( HEX_DBL( +, 1, 0, +, 53 ), x );
+    const double r = copysign(HEX_DBL(+, 1, 0, +, 53), x);
     double z = x + r;
     z -= r;
 
@@ -79,7 +80,8 @@ static double reduceHalf( double x )
         if( fabs(x) == INFINITY )
             return cl_make_nan();
 
-        return 0.0; //we patch up the sign for sinPi and cosPi later, since they need different signs
+        return 0.0; //we patch up the sign for sinPi and cosPi later, since they
+need different signs
     }
 
     // Find the nearest multiple of 1
@@ -92,362 +94,384 @@ static double reduceHalf( double x )
 }
 */
 
-double reference_acospi( double x) {  return reference_acos( x ) / M_PI;    }
-double reference_asinpi( double x) {  return reference_asin( x ) / M_PI;    }
-double reference_atanpi( double x) {  return reference_atan( x ) / M_PI;    }
-double reference_atan2pi( double y, double x ) { return reference_atan2( y, x) / M_PI; }
-double reference_cospi( double x)
+double reference_acospi(double x) { return reference_acos(x) / M_PI; }
+double reference_asinpi(double x) { return reference_asin(x) / M_PI; }
+double reference_atanpi(double x) { return reference_atan(x) / M_PI; }
+double reference_atan2pi(double y, double x)
 {
-    if( reference_fabs(x) >= HEX_DBL( +, 1, 0, +, 52 ) )
+    return reference_atan2(y, x) / M_PI;
+}
+double reference_cospi(double x)
+{
+    if (reference_fabs(x) >= HEX_DBL(+, 1, 0, +, 52))
     {
-        if( reference_fabs(x) == INFINITY )
-            return cl_make_nan();
+        if (reference_fabs(x) == INFINITY) return cl_make_nan();
 
-        //Note this probably fails for odd values between 0x1.0p52 and 0x1.0p53.
-        //However, when starting with single precision inputs, there will be no odd values.
+        // Note this probably fails for odd values between 0x1.0p52 and
+        // 0x1.0p53. However, when starting with single precision inputs, there
+        // will be no odd values.
 
         return 1.0;
     }
 
-    x = reduce1(x+0.5);
+    x = reduce1(x + 0.5);
 
     // reduce to [-0.5, 0.5]
-    if( x < -0.5 )
+    if (x < -0.5)
         x = -1 - x;
-    else if ( x > 0.5 )
+    else if (x > 0.5)
         x = 1 - x;
 
     // cosPi zeros are all +0
-    if( x == 0.0 )
-        return 0.0;
+    if (x == 0.0) return 0.0;
 
-    return reference_sin( x * M_PI );
+    return reference_sin(x * M_PI);
 }
 
 double reference_relaxed_cospi(double x) { return reference_cospi(x); }
 
-double reference_relaxed_divide( double x, double y ) { return (float)(((float) x ) / ( (float) y )); }
+double reference_relaxed_divide(double x, double y)
+{
+    return (float)(((float)x) / ((float)y));
+}
 
-double reference_divide( double x, double y ) { return x / y; }
+double reference_divide(double x, double y) { return x / y; }
 
 // Add a + b. If the result modulo overflowed, write 1 to *carry, otherwise 0
-static inline cl_ulong  add_carry( cl_ulong a, cl_ulong b, cl_ulong *carry )
+static inline cl_ulong add_carry(cl_ulong a, cl_ulong b, cl_ulong *carry)
 {
     cl_ulong result = a + b;
     *carry = result < a;
     return result;
 }
 
-// Subtract a - b. If the result modulo overflowed, write 1 to *carry, otherwise 0
-static inline cl_ulong  sub_carry( cl_ulong a, cl_ulong b, cl_ulong *carry )
+// Subtract a - b. If the result modulo overflowed, write 1 to *carry, otherwise
+// 0
+static inline cl_ulong sub_carry(cl_ulong a, cl_ulong b, cl_ulong *carry)
 {
     cl_ulong result = a - b;
     *carry = result > a;
     return result;
 }
 
-static float fallback_frexpf( float x, int *iptr )
+static float fallback_frexpf(float x, int *iptr)
 {
     cl_uint u, v;
     float fu, fv;
 
-    memcpy( &u, &x, sizeof(u));
+    memcpy(&u, &x, sizeof(u));
 
-    cl_uint exponent = u &  0x7f800000U;
+    cl_uint exponent = u & 0x7f800000U;
     cl_uint mantissa = u & ~0x7f800000U;
 
     // add 1 to the exponent
     exponent += 0x00800000U;
 
-    if( (cl_int) exponent < (cl_int) 0x01000000 )
+    if ((cl_int)exponent < (cl_int)0x01000000)
     { // subnormal, NaN, Inf
         mantissa |= 0x3f000000U;
 
         v = mantissa & 0xff800000U;
         u = mantissa;
-        memcpy( &fv, &v, sizeof(v));
-        memcpy( &fu, &u, sizeof(u));
+        memcpy(&fv, &v, sizeof(v));
+        memcpy(&fu, &u, sizeof(u));
 
         fu -= fv;
 
-        memcpy( &v, &fv, sizeof(v));
-        memcpy( &u, &fu, sizeof(u));
+        memcpy(&v, &fv, sizeof(v));
+        memcpy(&u, &fu, sizeof(u));
 
-        exponent = u &  0x7f800000U;
+        exponent = u & 0x7f800000U;
         mantissa = u & ~0x7f800000U;
 
-        *iptr = (exponent >> 23) + (-126 + 1 -126);
+        *iptr = (exponent >> 23) + (-126 + 1 - 126);
         u = mantissa | 0x3f000000U;
-        memcpy( &fu, &u, sizeof(u));
+        memcpy(&fu, &u, sizeof(u));
         return fu;
     }
 
     *iptr = (exponent >> 23) - 127;
     u = mantissa | 0x3f000000U;
-    memcpy( &fu, &u, sizeof(u));
+    memcpy(&fu, &u, sizeof(u));
     return fu;
 }
 
-static inline int extractf( float, cl_uint * );
-static inline int extractf( float x, cl_uint *mant )
+static inline int extractf(float, cl_uint *);
+static inline int extractf(float x, cl_uint *mant)
 {
-    static float (*frexppf)(float, int*) = NULL;
+    static float (*frexppf)(float, int *) = NULL;
     int e;
 
     // verify that frexp works properly
-    if( NULL == frexppf )
+    if (NULL == frexppf)
     {
-        if( 0.5f == frexpf( HEX_FLT( +, 1, 0, -, 130 ), &e ) && e == -129 )
+        if (0.5f == frexpf(HEX_FLT(+, 1, 0, -, 130), &e) && e == -129)
             frexppf = frexpf;
         else
             frexppf = fallback_frexpf;
     }
 
-    *mant = (cl_uint) (HEX_FLT( +, 1, 0, +, 32 ) * fabsf( frexppf( x, &e )));
+    *mant = (cl_uint)(HEX_FLT(+, 1, 0, +, 32) * fabsf(frexppf(x, &e)));
     return e - 1;
 }
 
-// Shift right by shift bits. Any bits lost on the right side are bitwise OR'd together and ORd into the LSB of the result
-static inline void shift_right_sticky_64( cl_ulong *p, int shift );
-static inline void shift_right_sticky_64( cl_ulong *p, int shift )
+// Shift right by shift bits. Any bits lost on the right side are bitwise OR'd
+// together and ORd into the LSB of the result
+static inline void shift_right_sticky_64(cl_ulong *p, int shift);
+static inline void shift_right_sticky_64(cl_ulong *p, int shift)
 {
     cl_ulong sticky = 0;
     cl_ulong r = *p;
 
     // C doesn't handle shifts greater than the size of the variable dependably
-    if( shift >= 64 )
+    if (shift >= 64)
     {
         sticky |= (0 != r);
         r = 0;
     }
     else
     {
-        sticky |= (0 != (r << (64-shift)));
+        sticky |= (0 != (r << (64 - shift)));
         r >>= shift;
     }
 
     *p = r | sticky;
 }
 
-// Add two 64 bit mantissas. Bits that are below the LSB of the result are OR'd into the LSB of the result
-static inline void add64( cl_ulong *p, cl_ulong c, int *exponent );
-static inline void add64( cl_ulong *p, cl_ulong c, int *exponent )
+// Add two 64 bit mantissas. Bits that are below the LSB of the result are OR'd
+// into the LSB of the result
+static inline void add64(cl_ulong *p, cl_ulong c, int *exponent);
+static inline void add64(cl_ulong *p, cl_ulong c, int *exponent)
 {
     cl_ulong carry;
     c = add_carry(c, *p, &carry);
-    if( carry )
+    if (carry)
     {
-        carry = c & 1;                              // set aside sticky bit
-        c >>= 1;                                    // right shift to deal with overflow
-        c |= carry | 0x8000000000000000ULL;         // or in carry bit, and sticky bit. The latter is to prevent rounding from believing we are exact half way case
-        *exponent = *exponent + 1;                  // adjust exponent
+        carry = c & 1; // set aside sticky bit
+        c >>= 1; // right shift to deal with overflow
+        c |= carry
+            | 0x8000000000000000ULL; // or in carry bit, and sticky bit. The
+                                     // latter is to prevent rounding from
+                                     // believing we are exact half way case
+        *exponent = *exponent + 1; // adjust exponent
     }
 
     *p = c;
 }
 
 // IEEE-754 round to nearest, ties to even rounding
-static float round_to_nearest_even_float( cl_ulong p, int exponent );
-static float round_to_nearest_even_float( cl_ulong p, int exponent )
+static float round_to_nearest_even_float(cl_ulong p, int exponent);
+static float round_to_nearest_even_float(cl_ulong p, int exponent)
 {
-    union{ cl_uint u; cl_float d;} u;
+    union {
+        cl_uint u;
+        cl_float d;
+    } u;
 
     // If mantissa is zero, return 0.0f
     if (p == 0) return 0.0f;
 
     // edges
-    if( exponent > 127 )
+    if (exponent > 127)
     {
-        volatile float r = exponent * CL_FLT_MAX;       // signal overflow
+        volatile float r = exponent * CL_FLT_MAX; // signal overflow
 
         // attempt to fool the compiler into not optimizing the above line away
-        if( r > CL_FLT_MAX )
-            return INFINITY;
+        if (r > CL_FLT_MAX) return INFINITY;
 
         return r;
     }
-    if( exponent == -150 && p > 0x8000000000000000ULL)
-        return HEX_FLT( +, 1, 0, -, 149 );
-    if( exponent <= -150 )       return 0.0f;
+    if (exponent == -150 && p > 0x8000000000000000ULL)
+        return HEX_FLT(+, 1, 0, -, 149);
+    if (exponent <= -150) return 0.0f;
 
-    //Figure out which bits go where
+    // Figure out which bits go where
     int shift = 8 + 32;
-    if( exponent < -126 )
+    if (exponent < -126)
     {
-        shift -= 126 + exponent;                    // subnormal: shift is not 52
-        exponent = -127;                            //            set exponent to 0
+        shift -= 126 + exponent; // subnormal: shift is not 52
+        exponent = -127; //            set exponent to 0
     }
     else
-        p &= 0x7fffffffffffffffULL;                 // normal: leading bit is implicit. Remove it.
+        p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
+                                    // it.
 
     // Assemble the double (round toward zero)
-    u.u = (cl_uint)(p >> shift) | ((cl_uint) (exponent + 127) << 23);
+    u.u = (cl_uint)(p >> shift) | ((cl_uint)(exponent + 127) << 23);
 
     // put a representation of the residual bits into hi
-    p <<= (64-shift);
+    p <<= (64 - shift);
 
-    //round to nearest, ties to even  based on the unused portion of p
-    if( p < 0x8000000000000000ULL )        return u.d;
-    if( p == 0x8000000000000000ULL )       u.u += u.u & 1U;
-    else                                   u.u++;
+    // round to nearest, ties to even  based on the unused portion of p
+    if (p < 0x8000000000000000ULL) return u.d;
+    if (p == 0x8000000000000000ULL)
+        u.u += u.u & 1U;
+    else
+        u.u++;
 
     return u.d;
 }
 
-static float round_to_nearest_even_float_ftz( cl_ulong p, int exponent );
-static float round_to_nearest_even_float_ftz( cl_ulong p, int exponent )
+static float round_to_nearest_even_float_ftz(cl_ulong p, int exponent);
+static float round_to_nearest_even_float_ftz(cl_ulong p, int exponent)
 {
     extern int gCheckTininessBeforeRounding;
 
-    union{ cl_uint u; cl_float d;} u;
+    union {
+        cl_uint u;
+        cl_float d;
+    } u;
     int shift = 8 + 32;
 
     // If mantissa is zero, return 0.0f
     if (p == 0) return 0.0f;
 
     // edges
-    if( exponent > 127 )
+    if (exponent > 127)
     {
-        volatile float r = exponent * CL_FLT_MAX;       // signal overflow
+        volatile float r = exponent * CL_FLT_MAX; // signal overflow
 
         // attempt to fool the compiler into not optimizing the above line away
-        if( r > CL_FLT_MAX )
-        return INFINITY;
+        if (r > CL_FLT_MAX) return INFINITY;
 
         return r;
     }
 
     // Deal with FTZ for gCheckTininessBeforeRounding
-    if( exponent < (gCheckTininessBeforeRounding - 127) )
-        return 0.0f;
+    if (exponent < (gCheckTininessBeforeRounding - 127)) return 0.0f;
 
-    if( exponent == -127 ) // only happens for machines that check tininess after rounding
-        p = (p&1) | (p>>1);
+    if (exponent
+        == -127) // only happens for machines that check tininess after rounding
+        p = (p & 1) | (p >> 1);
     else
-        p &= 0x7fffffffffffffffULL;     // normal: leading bit is implicit. Remove it.
+        p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
+                                    // it.
 
     cl_ulong q = p;
 
 
     // Assemble the double (round toward zero)
-    u.u = (cl_uint)(q >> shift) | ((cl_uint) (exponent + 127) << 23);
+    u.u = (cl_uint)(q >> shift) | ((cl_uint)(exponent + 127) << 23);
 
     // put a representation of the residual bits into hi
-    q <<= (64-shift);
+    q <<= (64 - shift);
 
-    //round to nearest, ties to even  based on the unused portion of p
-    if( q > 0x8000000000000000ULL )
+    // round to nearest, ties to even  based on the unused portion of p
+    if (q > 0x8000000000000000ULL)
         u.u++;
-    else if( q == 0x8000000000000000ULL )
+    else if (q == 0x8000000000000000ULL)
         u.u += u.u & 1U;
 
     // Deal with FTZ for ! gCheckTininessBeforeRounding
-    if( 0 == (u.u & 0x7f800000U )  )
-        return 0.0f;
+    if (0 == (u.u & 0x7f800000U)) return 0.0f;
 
     return u.d;
 }
 
 
 // IEEE-754 round toward zero.
-static float round_toward_zero_float( cl_ulong p, int exponent );
-static float round_toward_zero_float( cl_ulong p, int exponent )
+static float round_toward_zero_float(cl_ulong p, int exponent);
+static float round_toward_zero_float(cl_ulong p, int exponent)
 {
-    union{ cl_uint u; cl_float d;} u;
+    union {
+        cl_uint u;
+        cl_float d;
+    } u;
 
     // If mantissa is zero, return 0.0f
     if (p == 0) return 0.0f;
 
     // edges
-    if( exponent > 127 )
+    if (exponent > 127)
     {
-        volatile float r = exponent * CL_FLT_MAX;       // signal overflow
+        volatile float r = exponent * CL_FLT_MAX; // signal overflow
 
         // attempt to fool the compiler into not optimizing the above line away
-        if( r > CL_FLT_MAX )
-            return CL_FLT_MAX;
+        if (r > CL_FLT_MAX) return CL_FLT_MAX;
 
         return r;
     }
 
-    if( exponent <= -149 )
-        return 0.0f;
+    if (exponent <= -149) return 0.0f;
 
-    //Figure out which bits go where
+    // Figure out which bits go where
     int shift = 8 + 32;
-    if( exponent < -126 )
+    if (exponent < -126)
     {
-        shift -= 126 + exponent;                    // subnormal: shift is not 52
-        exponent = -127;                            //            set exponent to 0
+        shift -= 126 + exponent; // subnormal: shift is not 52
+        exponent = -127; //            set exponent to 0
     }
     else
-        p &= 0x7fffffffffffffffULL;                 // normal: leading bit is implicit. Remove it.
+        p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
+                                    // it.
 
     // Assemble the double (round toward zero)
-    u.u = (cl_uint)(p >> shift) | ((cl_uint) (exponent + 127) << 23);
+    u.u = (cl_uint)(p >> shift) | ((cl_uint)(exponent + 127) << 23);
 
     return u.d;
 }
 
-static float round_toward_zero_float_ftz( cl_ulong p, int exponent );
-static float round_toward_zero_float_ftz( cl_ulong p, int exponent )
+static float round_toward_zero_float_ftz(cl_ulong p, int exponent);
+static float round_toward_zero_float_ftz(cl_ulong p, int exponent)
 {
     extern int gCheckTininessBeforeRounding;
 
-    union{ cl_uint u; cl_float d;} u;
+    union {
+        cl_uint u;
+        cl_float d;
+    } u;
     int shift = 8 + 32;
 
     // If mantissa is zero, return 0.0f
     if (p == 0) return 0.0f;
 
     // edges
-    if( exponent > 127 )
+    if (exponent > 127)
     {
-        volatile float r = exponent * CL_FLT_MAX;       // signal overflow
+        volatile float r = exponent * CL_FLT_MAX; // signal overflow
 
         // attempt to fool the compiler into not optimizing the above line away
-        if( r > CL_FLT_MAX )
-            return CL_FLT_MAX;
+        if (r > CL_FLT_MAX) return CL_FLT_MAX;
 
         return r;
     }
 
     // Deal with FTZ for gCheckTininessBeforeRounding
-    if( exponent < -126 )
-        return 0.0f;
+    if (exponent < -126) return 0.0f;
 
-    cl_ulong q = p &= 0x7fffffffffffffffULL;     // normal: leading bit is implicit. Remove it.
+    cl_ulong q = p &=
+        0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it.
 
     // Assemble the double (round toward zero)
-    u.u = (cl_uint)(q >> shift) | ((cl_uint) (exponent + 127) << 23);
+    u.u = (cl_uint)(q >> shift) | ((cl_uint)(exponent + 127) << 23);
 
     // put a representation of the residual bits into hi
-    q <<= (64-shift);
+    q <<= (64 - shift);
 
     return u.d;
 }
 
 // Subtract two significands.
-static inline void sub64( cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC );
-static inline void sub64( cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC )
+static inline void sub64(cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC);
+static inline void sub64(cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC)
 {
     cl_ulong carry;
-    p = sub_carry( *c, p, &carry );
+    p = sub_carry(*c, p, &carry);
 
-    if( carry )
+    if (carry)
     {
         *signC ^= 0x80000000U;
         p = -p;
     }
 
     // normalize
-    if( p )
+    if (p)
     {
         int shift = 32;
         cl_ulong test = 1ULL << 32;
-        while( 0 == (p & 0x8000000000000000ULL))
+        while (0 == (p & 0x8000000000000000ULL))
         {
-            if( p < test )
+            if (p < test)
             {
                 p <<= shift;
                 *expC = *expC - shift;
@@ -460,49 +484,60 @@ static inline void sub64( cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC )
     {
         // zero result.
         *expC = -200;
-        *signC = 0;     // IEEE rules say a - a = +0 for all rounding modes except -inf
+        *signC =
+            0; // IEEE rules say a - a = +0 for all rounding modes except -inf
     }
 
     *c = p;
 }
 
 
-float reference_fma( float a, float b, float c, int shouldFlush )
+float reference_fma(float a, float b, float c, int shouldFlush)
 {
     static const cl_uint kMSB = 0x80000000U;
 
     // Make bits accessible
-    union{ cl_uint u; cl_float d; } ua; ua.d = a;
-    union{ cl_uint u; cl_float d; } ub; ub.d = b;
-    union{ cl_uint u; cl_float d; } uc; uc.d = c;
+    union {
+        cl_uint u;
+        cl_float d;
+    } ua;
+    ua.d = a;
+    union {
+        cl_uint u;
+        cl_float d;
+    } ub;
+    ub.d = b;
+    union {
+        cl_uint u;
+        cl_float d;
+    } uc;
+    uc.d = c;
 
     // deal with Nans, infinities and zeros
-    if( isnan( a ) || isnan( b ) || isnan(c)    ||
-        isinf( a ) || isinf( b ) || isinf(c)    ||
-        0 == ( ua.u & ~kMSB)                ||  // a == 0, defeat host FTZ behavior
-        0 == ( ub.u & ~kMSB)                ||  // b == 0, defeat host FTZ behavior
-        0 == ( uc.u & ~kMSB)                )   // c == 0, defeat host FTZ behavior
+    if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b) || isinf(c)
+        || 0 == (ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior
+        0 == (ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior
+        0 == (uc.u & ~kMSB)) // c == 0, defeat host FTZ behavior
     {
         FPU_mode_type oldMode;
         RoundingMode oldRoundMode = kRoundToNearestEven;
-        if( isinf( c ) && !isinf(a) && !isinf(b) )
-            return (c + a) + b;
+        if (isinf(c) && !isinf(a) && !isinf(b)) return (c + a) + b;
 
-        if (gIsInRTZMode)
-            oldRoundMode = set_round(kRoundTowardZero, kfloat);
+        if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
 
-        memset( &oldMode, 0, sizeof( oldMode ) );
-        if( shouldFlush )
-            ForceFTZ( &oldMode );
+        memset(&oldMode, 0, sizeof(oldMode));
+        if (shouldFlush) ForceFTZ(&oldMode);
 
-        a = (float) reference_multiply( a, b );    // some risk that the compiler will insert a non-compliant fma here on some platforms.
-        a = (float) reference_add( a, c );           // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
+        a = (float)reference_multiply(
+            a, b); // some risk that the compiler will insert a non-compliant
+                   // fma here on some platforms.
+        a = (float)reference_add(
+            a,
+            c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
 
-        if( shouldFlush )
-            RestoreFPState( &oldMode );
+        if (shouldFlush) RestoreFPState(&oldMode);
 
-        if( gIsInRTZMode )
-            set_round(oldRoundMode, kfloat);
+        if (gIsInRTZMode) set_round(oldRoundMode, kfloat);
         return a;
     }
 
@@ -510,67 +545,70 @@ float reference_fma( float a, float b, float c, int shouldFlush )
     //   exponent is a standard unbiased signed integer
     //   mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
     cl_uint mantA, mantB, mantC;
-    int expA = extractf( a, &mantA );
-    int expB = extractf( b, &mantB );
-    int expC = extractf( c, &mantC );
-    cl_uint signC = uc.u & kMSB;                // We'll need the sign bit of C later to decide if we are adding or subtracting
+    int expA = extractf(a, &mantA);
+    int expB = extractf(b, &mantB);
+    int expC = extractf(c, &mantC);
+    cl_uint signC = uc.u & kMSB; // We'll need the sign bit of C later to decide
+                                 // if we are adding or subtracting
 
-// exact product of A and B
+    // exact product of A and B
     int exponent = expA + expB;
     cl_uint sign = (ua.u ^ ub.u) & kMSB;
-    cl_ulong product = (cl_ulong) mantA * (cl_ulong) mantB;
+    cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB;
 
     // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
-    //  The MSB might not be set. If so, fix that. Otherwise, reflect the fact that we got another power of two from the multiplication
-    if( 0 == (0x8000000000000000ULL & product) )
+    //  The MSB might not be set. If so, fix that. Otherwise, reflect the fact
+    //  that we got another power of two from the multiplication
+    if (0 == (0x8000000000000000ULL & product))
         product <<= 1;
     else
-        exponent++;         // 2**31 * 2**31 gives 2**62. If the MSB was set, then our exponent increased.
+        exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then our
+                    // exponent increased.
 
-//infinite precision add
-    cl_ulong addend = (cl_ulong) mantC << 32;
-    if( exponent >= expC )
+    // infinite precision add
+    cl_ulong addend = (cl_ulong)mantC << 32;
+    if (exponent >= expC)
     {
         // Shift C relative to the product so that their exponents match
-        if( exponent > expC )
-            shift_right_sticky_64( &addend, exponent - expC );
+        if (exponent > expC) shift_right_sticky_64(&addend, exponent - expC);
 
         // Add
-        if( sign ^ signC )
-            sub64( &product, addend, &sign, &exponent );
+        if (sign ^ signC)
+            sub64(&product, addend, &sign, &exponent);
         else
-            add64( &product, addend, &exponent );
+            add64(&product, addend, &exponent);
     }
     else
     {
         // Shift the product relative to C so that their exponents match
-        shift_right_sticky_64( &product, expC - exponent );
+        shift_right_sticky_64(&product, expC - exponent);
 
         // add
-        if( sign ^ signC )
-            sub64( &addend, product, &signC, &expC );
+        if (sign ^ signC)
+            sub64(&addend, product, &signC, &expC);
         else
-            add64( &addend, product, &expC );
+            add64(&addend, product, &expC);
 
         product = addend;
         exponent = expC;
         sign = signC;
     }
 
-    // round to IEEE result -- we do not do flushing to zero here. That part is handled manually in ternary.c.
+    // round to IEEE result -- we do not do flushing to zero here. That part is
+    // handled manually in ternary.c.
     if (gIsInRTZMode)
     {
-        if( shouldFlush )
-            ua.d = round_toward_zero_float_ftz( product, exponent);
+        if (shouldFlush)
+            ua.d = round_toward_zero_float_ftz(product, exponent);
         else
-            ua.d = round_toward_zero_float( product, exponent);
+            ua.d = round_toward_zero_float(product, exponent);
     }
     else
     {
-        if( shouldFlush )
-            ua.d = round_to_nearest_even_float_ftz( product, exponent);
+        if (shouldFlush)
+            ua.d = round_to_nearest_even_float_ftz(product, exponent);
         else
-            ua.d = round_to_nearest_even_float( product, exponent);
+            ua.d = round_to_nearest_even_float(product, exponent);
     }
 
     // Set the sign
@@ -579,35 +617,36 @@ float reference_fma( float a, float b, float c, int shouldFlush )
     return ua.d;
 }
 
-double reference_relaxed_exp10( double x)
+double reference_relaxed_exp10(double x) { return reference_exp10(x); }
+
+double reference_exp10(double x)
 {
-  return reference_exp10(x);
+    return reference_exp2(x * HEX_DBL(+, 1, a934f0979a371, +, 1));
 }
 
-double reference_exp10( double x) {   return reference_exp2( x * HEX_DBL( +, 1, a934f0979a371, +, 1 ) );    }
 
-
-int   reference_ilogb( double x )
+int reference_ilogb(double x)
 {
     extern int gDeviceILogb0, gDeviceILogbNaN;
-    union { cl_double f; cl_ulong u;} u;
+    union {
+        cl_double f;
+        cl_ulong u;
+    } u;
 
-    u.f = (float) x;
-    cl_int exponent = (cl_int) (u.u >> 52) & 0x7ff;
-    if( exponent == 0x7ff )
+    u.f = (float)x;
+    cl_int exponent = (cl_int)(u.u >> 52) & 0x7ff;
+    if (exponent == 0x7ff)
     {
-        if( u.u & 0x000fffffffffffffULL )
-            return gDeviceILogbNaN;
+        if (u.u & 0x000fffffffffffffULL) return gDeviceILogbNaN;
 
         return CL_INT_MAX;
     }
 
-    if( exponent == 0 )
-    {   // deal with denormals
-        u.f = x * HEX_DBL( +, 1, 0, +, 64 );
-        exponent = (cl_int) (u.u >> 52) & 0x7ff;
-        if( exponent == 0 )
-            return gDeviceILogb0;
+    if (exponent == 0)
+    { // deal with denormals
+        u.f = x * HEX_DBL(+, 1, 0, +, 64);
+        exponent = (cl_int)(u.u >> 52) & 0x7ff;
+        if (exponent == 0) return gDeviceILogb0;
 
         return exponent - (1023 + 64);
     }
@@ -615,220 +654,208 @@ int   reference_ilogb( double x )
     return exponent - 1023;
 }
 
-double reference_nan( cl_uint x )
+double reference_nan(cl_uint x)
 {
-    union{ cl_uint u; cl_float f; }u;
+    union {
+        cl_uint u;
+        cl_float f;
+    } u;
     u.u = x | 0x7fc00000U;
-    return (double) u.f;
+    return (double)u.f;
 }
 
-double reference_maxmag( double x, double y )
+double reference_maxmag(double x, double y)
 {
     double fabsx = fabs(x);
     double fabsy = fabs(y);
 
-    if( fabsx < fabsy )
-        return y;
+    if (fabsx < fabsy) return y;
 
-    if( fabsy < fabsx )
-        return x;
+    if (fabsy < fabsx) return x;
 
-    return reference_fmax( x, y );
+    return reference_fmax(x, y);
 }
 
-double reference_minmag( double x, double y )
+double reference_minmag(double x, double y)
 {
     double fabsx = fabs(x);
     double fabsy = fabs(y);
 
-    if( fabsx > fabsy )
-        return y;
+    if (fabsx > fabsy) return y;
 
-    if( fabsy > fabsx )
-        return x;
+    if (fabsy > fabsx) return x;
 
-    return reference_fmin( x, y );
+    return reference_fmin(x, y);
 }
 
-//double my_nextafter( double x, double y ){  return (double) nextafterf( (float) x, (float) y ); }
+// double my_nextafter( double x, double y ){  return (double) nextafterf(
+// (float) x, (float) y ); }
 
-double reference_relaxed_mad( double a, double b, double c)
+double reference_relaxed_mad(double a, double b, double c)
 {
-  return ((float) a )* ((float) b) + (float) c;
+    return ((float)a) * ((float)b) + (float)c;
 }
 
-double reference_mad( double a, double b, double c )
-{
-    return a * b + c;
-}
+double reference_mad(double a, double b, double c) { return a * b + c; }
 
-double reference_recip( double x) {   return 1.0 / x; }
-double reference_rootn( double x, int i )
+double reference_recip(double x) { return 1.0 / x; }
+double reference_rootn(double x, int i)
 {
 
-    //rootn ( x, 0 )  returns a NaN.
-    if( 0 == i )
-        return cl_make_nan();
+    // rootn ( x, 0 )  returns a NaN.
+    if (0 == i) return cl_make_nan();
 
-    //rootn ( x, n )  returns a NaN for x < 0 and n is even.
-    if( x < 0 && 0 == (i&1) )
-        return cl_make_nan();
+    // rootn ( x, n )  returns a NaN for x < 0 and n is even.
+    if (x < 0 && 0 == (i & 1)) return cl_make_nan();
 
-    if( x == 0.0 )
+    if (x == 0.0)
     {
-        switch( i & 0x80000001 )
+        switch (i & 0x80000001)
         {
-            //rootn ( +-0,  n ) is +0 for even n > 0.
-            case 0:
-                return 0.0f;
+            // rootn ( +-0,  n ) is +0 for even n > 0.
+            case 0: return 0.0f;
 
-            //rootn ( +-0,  n ) is +-0 for odd n > 0.
-            case 1:
-                return x;
+            // rootn ( +-0,  n ) is +-0 for odd n > 0.
+            case 1: return x;
 
-            //rootn ( +-0,  n ) is +inf for even n < 0.
-            case 0x80000000:
-                return INFINITY;
+            // rootn ( +-0,  n ) is +inf for even n < 0.
+            case 0x80000000: return INFINITY;
 
-            //rootn ( +-0,  n ) is +-inf for odd n < 0.
-            case 0x80000001:
-                return copysign(INFINITY, x);
+            // rootn ( +-0,  n ) is +-inf for odd n < 0.
+            case 0x80000001: return copysign(INFINITY, x);
         }
     }
 
     double sign = x;
     x = reference_fabs(x);
-    x = reference_exp2( reference_log2(x) / (double) i );
-    return reference_copysignd( x, sign );
+    x = reference_exp2(reference_log2(x) / (double)i);
+    return reference_copysignd(x, sign);
 }
 
-double reference_rsqrt( double x) {   return 1.0 / reference_sqrt(x);   }
-//double reference_sincos( double x, double *c ){ *c = cos(x); return sin(x); }
-double reference_sinpi( double x)
+double reference_rsqrt(double x) { return 1.0 / reference_sqrt(x); }
+// double reference_sincos( double x, double *c ){ *c = cos(x); return sin(x); }
+double reference_sinpi(double x)
 {
     double r = reduce1(x);
 
     // reduce to [-0.5, 0.5]
-    if( r < -0.5 )
+    if (r < -0.5)
         r = -1 - r;
-    else if ( r > 0.5 )
+    else if (r > 0.5)
         r = 1 - r;
 
     // sinPi zeros have the same sign as x
-    if( r == 0.0 )
-        return reference_copysignd(0.0, x);
+    if (r == 0.0) return reference_copysignd(0.0, x);
 
-    return reference_sin( r * M_PI );
+    return reference_sin(r * M_PI);
 }
 
 double reference_relaxed_sinpi(double x) { return reference_sinpi(x); }
 
-double reference_tanpi( double x)
+double reference_tanpi(double x)
 {
     // set aside the sign  (allows us to preserve sign of -0)
-    double sign = reference_copysignd( 1.0, x);
+    double sign = reference_copysignd(1.0, x);
     double z = reference_fabs(x);
 
     // if big and even  -- caution: only works if x only has single precision
-    if( z >= HEX_DBL( +, 1, 0, +, 24 ) )
+    if (z >= HEX_DBL(+, 1, 0, +, 24))
     {
-        if( z == INFINITY )
-            return x - x;       // nan
+        if (z == INFINITY) return x - x; // nan
 
-        return reference_copysignd( 0.0, x);   // tanpi ( n ) is copysign( 0.0, n)  for even integers n.
+        return reference_copysignd(
+            0.0, x); // tanpi ( n ) is copysign( 0.0, n)  for even integers n.
     }
 
     // reduce to the range [ -0.5, 0.5 ]
-    double nearest = reference_rint( z );     // round to nearest even places n + 0.5 values in the right place for us
-    int i = (int) nearest;          // test above against 0x1.0p24 avoids overflow here
+    double nearest = reference_rint(z); // round to nearest even places n + 0.5
+                                        // values in the right place for us
+    int i = (int)nearest; // test above against 0x1.0p24 avoids overflow here
     z -= nearest;
 
-    //correction for odd integer x for the right sign of zero
-    if( (i&1) && z == 0.0 )
-        sign = -sign;
+    // correction for odd integer x for the right sign of zero
+    if ((i & 1) && z == 0.0) sign = -sign;
 
     // track changes to the sign
-    sign *= reference_copysignd(1.0, z);       // really should just be an xor
-    z = reference_fabs(z);                    // remove the sign again
+    sign *= reference_copysignd(1.0, z); // really should just be an xor
+    z = reference_fabs(z); // remove the sign again
 
     // reduce once more
-    // If we don't do this, rounding error in z * M_PI will cause us not to return infinities properly
-    if( z > 0.25 )
+    // If we don't do this, rounding error in z * M_PI will cause us not to
+    // return infinities properly
+    if (z > 0.25)
     {
         z = 0.5 - z;
-        return sign / reference_tan( z * M_PI );      // use system tan to get the right result
+        return sign
+            / reference_tan(z * M_PI); // use system tan to get the right result
     }
 
     //
-    return sign * reference_tan( z * M_PI );          // use system tan to get the right result
+    return sign
+        * reference_tan(z * M_PI); // use system tan to get the right result
 }
 
-double reference_pown( double x, int i) { return reference_pow( x, (double) i ); }
-double reference_powr( double x, double y )
+double reference_pown(double x, int i) { return reference_pow(x, (double)i); }
+double reference_powr(double x, double y)
 {
-    //powr ( x, y ) returns NaN for x < 0.
-    if( x < 0.0 )
-        return cl_make_nan();
+    // powr ( x, y ) returns NaN for x < 0.
+    if (x < 0.0) return cl_make_nan();
 
-    //powr ( x, NaN ) returns the NaN for x >= 0.
-    //powr ( NaN, y ) returns the NaN.
-    if( isnan(x) || isnan(y) )
-        return x + y;       // Note: behavior different here than for pow(1,NaN), pow(NaN, 0)
+    // powr ( x, NaN ) returns the NaN for x >= 0.
+    // powr ( NaN, y ) returns the NaN.
+    if (isnan(x) || isnan(y))
+        return x + y; // Note: behavior different here than for pow(1,NaN),
+                      // pow(NaN, 0)
 
-    if( x == 1.0 )
+    if (x == 1.0)
     {
-        //powr ( +1, +-inf ) returns NaN.
-        if( reference_fabs(y) == INFINITY )
-            return cl_make_nan();
+        // powr ( +1, +-inf ) returns NaN.
+        if (reference_fabs(y) == INFINITY) return cl_make_nan();
 
-        //powr ( +1, y ) is 1 for finite y.    (NaN handled above)
+        // powr ( +1, y ) is 1 for finite y.    (NaN handled above)
         return 1.0;
     }
 
-    if( y == 0.0 )
+    if (y == 0.0)
     {
-        //powr ( +inf, +-0 ) returns NaN.
-        //powr ( +-0, +-0 ) returns NaN.
-        if( x == 0.0 || x == INFINITY )
-            return cl_make_nan();
+        // powr ( +inf, +-0 ) returns NaN.
+        // powr ( +-0, +-0 ) returns NaN.
+        if (x == 0.0 || x == INFINITY) return cl_make_nan();
 
-        //powr ( x, +-0 ) is 1 for finite x > 0.  (x <= 0, NaN, INF already handled above)
+        // powr ( x, +-0 ) is 1 for finite x > 0.  (x <= 0, NaN, INF already
+        // handled above)
         return 1.0;
     }
 
-    if( x == 0.0 )
+    if (x == 0.0)
     {
-        //powr ( +-0, -inf) is +inf.
-        //powr ( +-0, y ) is +inf for finite y < 0.
-        if( y < 0.0 )
-            return INFINITY;
+        // powr ( +-0, -inf) is +inf.
+        // powr ( +-0, y ) is +inf for finite y < 0.
+        if (y < 0.0) return INFINITY;
 
-        //powr ( +-0, y ) is +0 for y > 0.    (NaN, y==0 handled above)
+        // powr ( +-0, y ) is +0 for y > 0.    (NaN, y==0 handled above)
         return 0.0;
     }
 
     // x = +inf
-    if( isinf(x) )
+    if (isinf(x))
     {
-        if( y < 0 )
-            return 0;
+        if (y < 0) return 0;
         return INFINITY;
     }
 
     double fabsx = reference_fabs(x);
     double fabsy = reference_fabs(y);
 
-    //y = +-inf cases
-    if( isinf(fabsy) )
+    // y = +-inf cases
+    if (isinf(fabsy))
     {
-        if( y < 0 )
+        if (y < 0)
         {
-            if( fabsx < 1 )
-                return INFINITY;
+            if (fabsx < 1) return INFINITY;
             return 0;
         }
-        if( fabsx < 1 )
-            return 0;
+        if (fabsx < 1) return 0;
         return INFINITY;
     }
 
@@ -840,169 +867,212 @@ double reference_powr( double x, double y )
     return result;
 }
 
-double reference_fract( double x, double *ip )
+double reference_fract(double x, double *ip)
 {
-    if(isnan(x)) {
+    if (isnan(x))
+    {
         *ip = cl_make_nan();
         return cl_make_nan();
     }
 
     float i;
-    float f = modff((float) x, &i );
-    if( f < 0.0 )
+    float f = modff((float)x, &i);
+    if (f < 0.0)
     {
         f = 1.0f + f;
         i -= 1.0f;
-        if( f == 1.0f )
-            f = HEX_FLT( +, 1, fffffe, -, 1 );
+        if (f == 1.0f) f = HEX_FLT(+, 1, fffffe, -, 1);
     }
     *ip = i;
     return f;
 }
 
 
-//double my_fdim( double x, double y){ return fdimf( (float) x, (float) y ); }
-double reference_add( double x, double y )
+// double my_fdim( double x, double y){ return fdimf( (float) x, (float) y ); }
+double reference_add(double x, double y)
 {
-    volatile float a = (float) x;
-    volatile float b = (float) y;
+    volatile float a = (float)x;
+    volatile float b = (float)y;
 
-#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
+#if defined(__SSE__)                                                           \
+    || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
     // defeat x87
-    __m128 va = _mm_set_ss( (float) a );
-    __m128 vb = _mm_set_ss( (float) b );
-    va = _mm_add_ss( va, vb );
-    _mm_store_ss( (float*) &a, va );
+    __m128 va = _mm_set_ss((float)a);
+    __m128 vb = _mm_set_ss((float)b);
+    va = _mm_add_ss(va, vb);
+    _mm_store_ss((float *)&a, va);
 #elif defined(__PPC__)
-    // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes denorm's to zero.
-    // As such, the reference add with FTZ must be emulated in sw.
-    if (fpu_control & _FPU_MASK_NI) {
-      union{ cl_uint u; cl_float d; } ua; ua.d = a;
-      union{ cl_uint u; cl_float d; } ub; ub.d = b;
-      cl_uint mantA, mantB;
-      cl_ulong addendA, addendB, sum;
-      int expA = extractf( a, &mantA );
-      int expB = extractf( b, &mantB );
-      cl_uint signA = ua.u & 0x80000000U;
-      cl_uint signB = ub.u & 0x80000000U;
-
-      // Force matching exponents if an operand is 0
-      if (a == 0.0f) {
-    expA = expB;
-      } else if (b == 0.0f) {
-    expB = expA;
-      }
-
-      addendA = (cl_ulong)mantA << 32;
-      addendB = (cl_ulong)mantB << 32;
-
-      if (expA >= expB) {
-        // Shift B relative to the A so that their exponents match
-        if( expA > expB )
-      shift_right_sticky_64( &addendB, expA - expB );
+    // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes
+    // denorm's to zero. As such, the reference add with FTZ must be emulated in
+    // sw.
+    if (fpu_control & _FPU_MASK_NI)
+    {
+        union {
+            cl_uint u;
+            cl_float d;
+        } ua;
+        ua.d = a;
+        union {
+            cl_uint u;
+            cl_float d;
+        } ub;
+        ub.d = b;
+        cl_uint mantA, mantB;
+        cl_ulong addendA, addendB, sum;
+        int expA = extractf(a, &mantA);
+        int expB = extractf(b, &mantB);
+        cl_uint signA = ua.u & 0x80000000U;
+        cl_uint signB = ub.u & 0x80000000U;
+
+        // Force matching exponents if an operand is 0
+        if (a == 0.0f)
+        {
+            expA = expB;
+        }
+        else if (b == 0.0f)
+        {
+            expB = expA;
+        }
 
-        // add
-        if( signA ^ signB )
-      sub64( &addendA, addendB, &signA, &expA );
+        addendA = (cl_ulong)mantA << 32;
+        addendB = (cl_ulong)mantB << 32;
+
+        if (expA >= expB)
+        {
+            // Shift B relative to the A so that their exponents match
+            if (expA > expB) shift_right_sticky_64(&addendB, expA - expB);
+
+            // add
+            if (signA ^ signB)
+                sub64(&addendA, addendB, &signA, &expA);
+            else
+                add64(&addendA, addendB, &expA);
+        }
         else
-      add64( &addendA, addendB, &expA );
-      } else  {
-        // Shift the A relative to B so that their exponents match
-        shift_right_sticky_64( &addendA, expB - expA );
+        {
+            // Shift the A relative to B so that their exponents match
+            shift_right_sticky_64(&addendA, expB - expA);
 
-        // add
-        if( signA ^ signB )
-      sub64( &addendB, addendA, &signB, &expB );
+            // add
+            if (signA ^ signB)
+                sub64(&addendB, addendA, &signB, &expB);
+            else
+                add64(&addendB, addendA, &expB);
+
+            addendA = addendB;
+            expA = expB;
+            signA = signB;
+        }
+
+        // round to IEEE result
+        if (gIsInRTZMode)
+        {
+            ua.d = round_toward_zero_float_ftz(addendA, expA);
+        }
         else
-      add64( &addendB, addendA, &expB );
-
-        addendA = addendB;
-        expA = expB;
-        signA = signB;
-      }
-
-      // round to IEEE result
-      if (gIsInRTZMode)    {
-    ua.d = round_toward_zero_float_ftz( addendA, expA );
-      } else {
-    ua.d = round_to_nearest_even_float_ftz( addendA, expA );
-      }
-      // Set the sign
-      ua.u |= signA;
-      a = ua.d;
-    } else {
-      a += b;
+        {
+            ua.d = round_to_nearest_even_float_ftz(addendA, expA);
+        }
+        // Set the sign
+        ua.u |= signA;
+        a = ua.d;
+    }
+    else
+    {
+        a += b;
     }
 #else
     a += b;
 #endif
-    return (double) a;
- }
+    return (double)a;
+}
 
 
-double reference_subtract( double x, double y )
+double reference_subtract(double x, double y)
 {
-    volatile float a = (float) x;
-    volatile float b = (float) y;
-#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
+    volatile float a = (float)x;
+    volatile float b = (float)y;
+#if defined(__SSE__)                                                           \
+    || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
     // defeat x87
-    __m128 va = _mm_set_ss( (float) a );
-    __m128 vb = _mm_set_ss( (float) b );
-    va = _mm_sub_ss( va, vb );
-    _mm_store_ss( (float*) &a, va );
+    __m128 va = _mm_set_ss((float)a);
+    __m128 vb = _mm_set_ss((float)b);
+    va = _mm_sub_ss(va, vb);
+    _mm_store_ss((float *)&a, va);
 #else
     a -= b;
 #endif
     return a;
 }
 
-//double reference_divide( double x, double y ){ return (float) x / (float) y; }
-double reference_multiply( double x, double y)
+// double reference_divide( double x, double y ){ return (float) x / (float) y;
+// }
+double reference_multiply(double x, double y)
 {
-    volatile float a = (float) x;
-    volatile float b = (float) y;
-#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
+    volatile float a = (float)x;
+    volatile float b = (float)y;
+#if defined(__SSE__)                                                           \
+    || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
     // defeat x87
-    __m128 va = _mm_set_ss( (float) a );
-    __m128 vb = _mm_set_ss( (float) b );
-    va = _mm_mul_ss( va, vb );
-    _mm_store_ss( (float*) &a, va );
+    __m128 va = _mm_set_ss((float)a);
+    __m128 vb = _mm_set_ss((float)b);
+    va = _mm_mul_ss(va, vb);
+    _mm_store_ss((float *)&a, va);
 #elif defined(__PPC__)
-    // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes denorm's to zero.
-    // As such, the reference multiply with FTZ must be emulated in sw.
-    if (fpu_control & _FPU_MASK_NI) {
-      // extract exponent and mantissa
-      //   exponent is a standard unbiased signed integer
-      //   mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
-      union{ cl_uint u; cl_float d; } ua; ua.d = a;
-      union{ cl_uint u; cl_float d; } ub; ub.d = b;
-      cl_uint mantA, mantB;
-      int expA = extractf( a, &mantA );
-      int expB = extractf( b, &mantB );
-
-      // exact product of A and B
-      int exponent = expA + expB;
-      cl_uint sign = (ua.u ^ ub.u) & 0x80000000U;
-      cl_ulong product = (cl_ulong) mantA * (cl_ulong) mantB;
-
-      // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
-      //  The MSB might not be set. If so, fix that. Otherwise, reflect the fact that we got another power of two from the multiplication
-      if( 0 == (0x8000000000000000ULL & product) )
-        product <<= 1;
-      else
-        exponent++;         // 2**31 * 2**31 gives 2**62. If the MSB was set, then our exponent increased.
-
-      // round to IEEE result -- we do not do flushing to zero here. That part is handled manually in ternary.c.
-      if (gIsInRTZMode)    {
-    ua.d = round_toward_zero_float_ftz( product, exponent);
-      } else {
-    ua.d = round_to_nearest_even_float_ftz( product, exponent);
-      }
-      // Set the sign
-      ua.u |= sign;
-      a = ua.d;
-    } else {
-      a *= b;
+    // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes
+    // denorm's to zero. As such, the reference multiply with FTZ must be
+    // emulated in sw.
+    if (fpu_control & _FPU_MASK_NI)
+    {
+        // extract exponent and mantissa
+        //   exponent is a standard unbiased signed integer
+        //   mantissa is a cl_uint, with leading non-zero bit positioned at the
+        //   MSB
+        union {
+            cl_uint u;
+            cl_float d;
+        } ua;
+        ua.d = a;
+        union {
+            cl_uint u;
+            cl_float d;
+        } ub;
+        ub.d = b;
+        cl_uint mantA, mantB;
+        int expA = extractf(a, &mantA);
+        int expB = extractf(b, &mantB);
+
+        // exact product of A and B
+        int exponent = expA + expB;
+        cl_uint sign = (ua.u ^ ub.u) & 0x80000000U;
+        cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB;
+
+        // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
+        //  The MSB might not be set. If so, fix that. Otherwise, reflect the
+        //  fact that we got another power of two from the multiplication
+        if (0 == (0x8000000000000000ULL & product))
+            product <<= 1;
+        else
+            exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then
+                        // our exponent increased.
+
+        // round to IEEE result -- we do not do flushing to zero here. That part
+        // is handled manually in ternary.c.
+        if (gIsInRTZMode)
+        {
+            ua.d = round_toward_zero_float_ftz(product, exponent);
+        }
+        else
+        {
+            ua.d = round_to_nearest_even_float_ftz(product, exponent);
+        }
+        // Set the sign
+        ua.u |= sign;
+        a = ua.d;
+    }
+    else
+    {
+        a *= b;
     }
 #else
     a *= b;
@@ -1022,7 +1092,7 @@ double reference_multiply( double x, double y)
 
     return (double) remquof( (float) x, (float) y, iptr );
 }*/
-double reference_lgamma_r( double x, int *signp )
+double reference_lgamma_r(double x, int *signp)
 {
     // This is not currently tested
     *signp = 0;
@@ -1030,81 +1100,93 @@ double reference_lgamma_r( double x, int *signp )
 }
 
 
-int reference_isequal( double x, double y ){ return x == y; }
-int reference_isfinite( double x ){ return 0 != isfinite(x); }
-int reference_isgreater( double x, double y ){ return x > y; }
-int reference_isgreaterequal( double x, double y ){ return x >= y; }
-int reference_isinf( double x ){ return 0 != isinf(x); }
-int reference_isless( double x, double y ){ return x < y; }
-int reference_islessequal( double x, double y ){ return x <= y; }
-int reference_islessgreater( double x, double y ){  return 0 != islessgreater( x, y ); }
-int reference_isnan( double x ){ return 0 != isnan( x ); }
-int reference_isnormal( double x ){ return 0 != isnormal( (float) x ); }
-int reference_isnotequal( double x, double y ){ return x != y; }
-int reference_isordered( double x, double y){ return x == x && y == y; }
-int reference_isunordered( double x, double y ){ return isnan(x) || isnan( y ); }
-int reference_signbit( float x ){ return 0 != signbit( x ); }
+int reference_isequal(double x, double y) { return x == y; }
+int reference_isfinite(double x) { return 0 != isfinite(x); }
+int reference_isgreater(double x, double y) { return x > y; }
+int reference_isgreaterequal(double x, double y) { return x >= y; }
+int reference_isinf(double x) { return 0 != isinf(x); }
+int reference_isless(double x, double y) { return x < y; }
+int reference_islessequal(double x, double y) { return x <= y; }
+int reference_islessgreater(double x, double y)
+{
+    return 0 != islessgreater(x, y);
+}
+int reference_isnan(double x) { return 0 != isnan(x); }
+int reference_isnormal(double x) { return 0 != isnormal((float)x); }
+int reference_isnotequal(double x, double y) { return x != y; }
+int reference_isordered(double x, double y) { return x == x && y == y; }
+int reference_isunordered(double x, double y) { return isnan(x) || isnan(y); }
+int reference_signbit(float x) { return 0 != signbit(x); }
 
 #if 1 // defined( _MSC_VER )
 
-//Missing functions for win32
+// Missing functions for win32
 
 
-float reference_copysign( float x, float y )
+float reference_copysign(float x, float y)
 {
-    union { float f; cl_uint u;} ux, uy;
-    ux.f = x; uy.f = y;
+    union {
+        float f;
+        cl_uint u;
+    } ux, uy;
+    ux.f = x;
+    uy.f = y;
     ux.u &= 0x7fffffffU;
     ux.u |= uy.u & 0x80000000U;
     return ux.f;
 }
 
 
-double reference_copysignd( double x, double y )
+double reference_copysignd(double x, double y)
 {
-    union { double f; cl_ulong u;} ux, uy;
-    ux.f = x; uy.f = y;
+    union {
+        double f;
+        cl_ulong u;
+    } ux, uy;
+    ux.f = x;
+    uy.f = y;
     ux.u &= 0x7fffffffffffffffULL;
     ux.u |= uy.u & 0x8000000000000000ULL;
     return ux.f;
 }
 
 
-double reference_round( double x )
+double reference_round(double x)
 {
     double absx = reference_fabs(x);
-    if( absx < 0.5 )
-        return reference_copysignd( 0.0, x );
+    if (absx < 0.5) return reference_copysignd(0.0, x);
 
-    if( absx < HEX_DBL( +, 1, 0, +, 53 ) )
-        x = reference_trunc( x + reference_copysignd( 0.5, x ) );
+    if (absx < HEX_DBL(+, 1, 0, +, 53))
+        x = reference_trunc(x + reference_copysignd(0.5, x));
 
     return x;
 }
 
-double reference_trunc( double x )
+double reference_trunc(double x)
 {
-    if( fabs(x) < HEX_DBL( +, 1, 0, +, 53 ) )
+    if (fabs(x) < HEX_DBL(+, 1, 0, +, 53))
     {
-        cl_long l = (cl_long) x;
+        cl_long l = (cl_long)x;
 
-        return reference_copysignd( (double) l, x );
+        return reference_copysignd((double)l, x);
     }
 
     return x;
 }
 
 #ifndef FP_ILOGB0
-    #define FP_ILOGB0   INT_MIN
+#define FP_ILOGB0 INT_MIN
 #endif
 
 #ifndef FP_ILOGBNAN
-    #define FP_ILOGBNAN   INT_MAX
+#define FP_ILOGBNAN INT_MAX
 #endif
 
 
-
-double reference_cbrt(double x){ return reference_copysignd( reference_pow( reference_fabs(x), 1.0/3.0 ), x ); }
+double reference_cbrt(double x)
+{
+    return reference_copysignd(reference_pow(reference_fabs(x), 1.0 / 3.0), x);
+}
 
 /*
 double reference_scalbn(double x, int i)
@@ -1122,174 +1204,188 @@ double reference_scalbn(double x, int i)
 }
 */
 
-double reference_rint( double x )
+double reference_rint(double x)
 {
-    if( reference_fabs(x) < HEX_DBL( +, 1, 0, +, 52 )  )
+    if (reference_fabs(x) < HEX_DBL(+, 1, 0, +, 52))
     {
-        double magic = reference_copysignd( HEX_DBL( +, 1, 0, +, 52 ), x );
+        double magic = reference_copysignd(HEX_DBL(+, 1, 0, +, 52), x);
         double rounded = (x + magic) - magic;
-        x = reference_copysignd( rounded, x );
+        x = reference_copysignd(rounded, x);
     }
 
     return x;
 }
 
-double reference_acosh( double x )
+double reference_acosh(double x)
 { // not full precision. Sufficient precision to cover float
-    if( isnan(x) )
-        return x + x;
+    if (isnan(x)) return x + x;
 
-    if( x < 1.0 )
-        return cl_make_nan();
+    if (x < 1.0) return cl_make_nan();
 
-    return reference_log( x + reference_sqrt(x + 1) * reference_sqrt(x-1) );
+    return reference_log(x + reference_sqrt(x + 1) * reference_sqrt(x - 1));
 }
 
-double reference_asinh( double x )
+double reference_asinh(double x)
 {
-/*
- * ====================================================
- * This function is from fdlibm: http://www.netlib.org
- *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-    if( isnan(x) || isinf(x) )
-        return x + x;
+    /*
+     * ====================================================
+     * This function is from fdlibm: http://www.netlib.org
+     *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+     *
+     * Developed at SunSoft, a Sun Microsystems, Inc. business.
+     * Permission to use, copy, modify, and distribute this
+     * software is freely granted, provided that this notice
+     * is preserved.
+     * ====================================================
+     */
+    if (isnan(x) || isinf(x)) return x + x;
 
     double absx = reference_fabs(x);
-    if( absx < HEX_DBL( +, 1, 0, -, 28 ) )
-        return x;
+    if (absx < HEX_DBL(+, 1, 0, -, 28)) return x;
 
     double sign = reference_copysignd(1.0, x);
 
-    if( absx > HEX_DBL( +, 1, 0, +, 28 ) )
-        return sign * (reference_log( absx ) + 0.693147180559945309417232121458176568);    // log(2)
+    if (absx > HEX_DBL(+, 1, 0, +, 28))
+        return sign
+            * (reference_log(absx)
+               + 0.693147180559945309417232121458176568); // log(2)
 
-    if( absx > 2.0 )
-        return sign * reference_log( 2.0 * absx + 1.0 / (reference_sqrt( x * x + 1.0 ) + absx));
+    if (absx > 2.0)
+        return sign
+            * reference_log(2.0 * absx
+                            + 1.0 / (reference_sqrt(x * x + 1.0) + absx));
 
-    return sign * reference_log1p( absx + x*x / (1.0 + reference_sqrt(1.0 + x*x)));
+    return sign
+        * reference_log1p(absx + x * x / (1.0 + reference_sqrt(1.0 + x * x)));
 }
 
 
-double reference_atanh( double x )
+double reference_atanh(double x)
 {
-/*
- * ====================================================
- * This function is from fdlibm: http://www.netlib.org
- *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-    if( isnan(x)  )
-        return x + x;
+    /*
+     * ====================================================
+     * This function is from fdlibm: http://www.netlib.org
+     *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+     *
+     * Developed at SunSoft, a Sun Microsystems, Inc. business.
+     * Permission to use, copy, modify, and distribute this
+     * software is freely granted, provided that this notice
+     * is preserved.
+     * ====================================================
+     */
+    if (isnan(x)) return x + x;
 
-    double signed_half = reference_copysignd( 0.5, x );
+    double signed_half = reference_copysignd(0.5, x);
     x = reference_fabs(x);
-    if( x > 1.0 )
-        return cl_make_nan();
+    if (x > 1.0) return cl_make_nan();
 
-    if( x < 0.5 )
-        return signed_half * reference_log1p( 2.0 * ( x + x*x / (1-x) ) );
+    if (x < 0.5)
+        return signed_half * reference_log1p(2.0 * (x + x * x / (1 - x)));
 
-    return signed_half * reference_log1p(2.0 * x / (1-x));
+    return signed_half * reference_log1p(2.0 * x / (1 - x));
 }
 
 double reference_relaxed_atan(double x) { return reference_atan(x); }
 
-double reference_relaxed_exp2( double x )
-{
-  return reference_exp2(x);
-}
+double reference_relaxed_exp2(double x) { return reference_exp2(x); }
 
-double reference_exp2( double x )
-{ // Note: only suitable for verifying single precision. Doesn't have range of a full double exp2 implementation.
-    if( x == 0.0 )
-        return 1.0;
+double reference_exp2(double x)
+{ // Note: only suitable for verifying single precision. Doesn't have range of a
+  // full double exp2 implementation.
+    if (x == 0.0) return 1.0;
 
     // separate x into fractional and integer parts
-    double i = reference_rint( x );        // round to nearest integer
+    double i = reference_rint(x); // round to nearest integer
 
-    if( i < -150 )
-        return 0.0;
+    if (i < -150) return 0.0;
 
-    if( i > 129 )
-        return INFINITY;
+    if (i > 129) return INFINITY;
 
-    double f = x - i;            // -0.5 <= f <= 0.5
+    double f = x - i; // -0.5 <= f <= 0.5
 
     // find exp2(f)
     // calculate as p(f) = (exp2(f)-1)/f
     //              exp2(f) = f * p(f) + 1
     // p(f) is a minimax polynomial with error within 0x1.c1fd80f0d1ab7p-50
 
-    double p = 0.693147180560184539289 +
-               (0.240226506955902863183 +
-               (0.055504108656833424373 +
-               (0.009618129212846484796 +
-               (0.001333355902958566035 +
-               (0.000154034191902497930 +
-               (0.000015252317761038105 +
-               (0.000001326283129417092 + 0.000000102593187638680 * f)*f)*f)*f)*f)*f)*f)*f;
+    double p = 0.693147180560184539289
+        + (0.240226506955902863183
+           + (0.055504108656833424373
+              + (0.009618129212846484796
+                 + (0.001333355902958566035
+                    + (0.000154034191902497930
+                       + (0.000015252317761038105
+                          + (0.000001326283129417092
+                             + 0.000000102593187638680 * f)
+                              * f)
+                           * f)
+                        * f)
+                     * f)
+                  * f)
+               * f)
+            * f;
     f *= p;
     f += 1.0;
 
     // scale by 2 ** i
-    union{ cl_ulong u; double d; } u;
-    int exponent = (int) i + 1023;
-    u.u = (cl_ulong) exponent << 52;
+    union {
+        cl_ulong u;
+        double d;
+    } u;
+    int exponent = (int)i + 1023;
+    u.u = (cl_ulong)exponent << 52;
 
     return f * u.d;
 }
 
 
-double reference_expm1( double x )
-{ // Note: only suitable for verifying single precision. Doesn't have range of a full double expm1 implementation. It is only accurate to 47 bits or less.
+double reference_expm1(double x)
+{ // Note: only suitable for verifying single precision. Doesn't have range of a
+  // full double expm1 implementation. It is only accurate to 47 bits or less.
 
     // early out for small numbers and NaNs
-    if( ! (reference_fabs(x) > HEX_DBL( +, 1, 0, -, 24 )) )
-        return x;
+    if (!(reference_fabs(x) > HEX_DBL(+, 1, 0, -, 24))) return x;
 
     // early out for large negative numbers
-    if( x < -130.0 )
-        return -1.0;
+    if (x < -130.0) return -1.0;
 
     // early out for large positive numbers
-    if( x > 100.0 )
-        return INFINITY;
+    if (x > 100.0) return INFINITY;
 
     // separate x into fractional and integer parts
-    double i = reference_rint( x );        // round to nearest integer
-    double f = x - i;            // -0.5 <= f <= 0.5
+    double i = reference_rint(x); // round to nearest integer
+    double f = x - i; // -0.5 <= f <= 0.5
 
     // reduce f to the range -0.0625 .. f.. 0.0625
-    int index = (int) (f * 16.0) + 8;       // 0...16
+    int index = (int)(f * 16.0) + 8; // 0...16
 
-    static const double reduction[17] = { -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625,
-                                           0.0,
-                                          +0.0625, +0.125, +0.1875, +0.25, +0.3125, +0.375, +0.4375, +0.5  };
+    static const double reduction[17] = { -0.5,  -0.4375, -0.375, -0.3125,
+                                          -0.25, -0.1875, -0.125, -0.0625,
+                                          0.0,   +0.0625, +0.125, +0.1875,
+                                          +0.25, +0.3125, +0.375, +0.4375,
+                                          +0.5 };
 
 
     // exponentials[i] = expm1(reduction[i])
-    static const double exponentials[17] = {    HEX_DBL( -, 1, 92e9a0720d3ec, -, 2 ),    HEX_DBL( -, 1, 6adb1cd9205ee, -, 2 ),
-                                                HEX_DBL( -, 1, 40373d42ce2e3, -, 2 ),    HEX_DBL( -, 1, 12d35a41ba104, -, 2 ),
-                                                HEX_DBL( -, 1, c5041854df7d4, -, 3 ),    HEX_DBL( -, 1, 5e25fb4fde211, -, 3 ),
-                                                HEX_DBL( -, 1, e14aed893eef4, -, 4 ),    HEX_DBL( -, 1, f0540438fd5c3, -, 5 ),
-                                                HEX_DBL( +, 0, 0,             +, 0 ),
-                                                HEX_DBL( +, 1, 082b577d34ed8, -, 4 ),    HEX_DBL( +, 1, 10b022db7ae68, -, 3 ),
-                                                HEX_DBL( +, 1, a65c0b85ac1a9, -, 3 ),    HEX_DBL( +, 1, 22d78f0fa061a, -, 2 ),
-                                                HEX_DBL( +, 1, 77a45d8117fd5, -, 2 ),    HEX_DBL( +, 1, d1e944f6fbdaa, -, 2 ),
-                                                HEX_DBL( +, 1, 190048ef6002,  -, 1 ),    HEX_DBL( +, 1, 4c2531c3c0d38, -, 1 ),
-                                            };
+    static const double exponentials[17] = {
+        HEX_DBL(-, 1, 92e9a0720d3ec, -, 2),
+        HEX_DBL(-, 1, 6adb1cd9205ee, -, 2),
+        HEX_DBL(-, 1, 40373d42ce2e3, -, 2),
+        HEX_DBL(-, 1, 12d35a41ba104, -, 2),
+        HEX_DBL(-, 1, c5041854df7d4, -, 3),
+        HEX_DBL(-, 1, 5e25fb4fde211, -, 3),
+        HEX_DBL(-, 1, e14aed893eef4, -, 4),
+        HEX_DBL(-, 1, f0540438fd5c3, -, 5),
+        HEX_DBL(+, 0, 0, +, 0),
+        HEX_DBL(+, 1, 082b577d34ed8, -, 4),
+        HEX_DBL(+, 1, 10b022db7ae68, -, 3),
+        HEX_DBL(+, 1, a65c0b85ac1a9, -, 3),
+        HEX_DBL(+, 1, 22d78f0fa061a, -, 2),
+        HEX_DBL(+, 1, 77a45d8117fd5, -, 2),
+        HEX_DBL(+, 1, d1e944f6fbdaa, -, 2),
+        HEX_DBL(+, 1, 190048ef6002, -, 1),
+        HEX_DBL(+, 1, 4c2531c3c0d38, -, 1),
+    };
 
 
     f -= reduction[index];
@@ -1297,223 +1393,368 @@ double reference_expm1( double x )
     // find expm1(f)
     // calculate as p(f) = (exp(f)-1)/f
     //              expm1(f) = f * p(f)
-    // p(f) is a minimax polynomial with error within 0x1.1d7693618d001p-48 over the range +- 0.0625
-    double p = 0.999999999999998001599 +
-               (0.499999999999839628284 +
-               (0.166666666672817459505 +
-               (0.041666666612283048687 +
-               (0.008333330214567431435 +
-               (0.001389005319303770070 + 0.000198833381525156667 * f)*f)*f)*f)*f)*f;
+    // p(f) is a minimax polynomial with error within 0x1.1d7693618d001p-48 over
+    // the range +- 0.0625
+    double p = 0.999999999999998001599
+        + (0.499999999999839628284
+           + (0.166666666672817459505
+              + (0.041666666612283048687
+                 + (0.008333330214567431435
+                    + (0.001389005319303770070 + 0.000198833381525156667 * f)
+                        * f)
+                     * f)
+                  * f)
+               * f)
+            * f;
     f *= p; // expm1( reduced f )
 
     // expm1(f) = (exmp1( reduced_f) + 1.0) * ( exponentials[index] + 1 ) - 1
-    //          =  exmp1( reduced_f) * exponentials[index] + exmp1( reduced_f) + exponentials[index] + 1 -1
-    //          =  exmp1( reduced_f) * exponentials[index] + exmp1( reduced_f) + exponentials[index]
-    f +=  exponentials[index] + f * exponentials[index];
+    //          =  expm1( reduced_f) * exponentials[index] + expm1( reduced_f)
+    //             + exponentials[index] + 1 - 1
+    //          =  the same sum with the trailing "+ 1 - 1" cancelled
+    f += exponentials[index] + f * exponentials[index];
 
     // scale by e ** i
-    int exponent = (int) i;
-    if( 0 == exponent )
-        return f;       // precise answer for x near 1
+    int exponent = (int)i;
+    if (0 == exponent) return f; // precise answer for x near 0 (rint(x) == 0)
 
     // table of e**(i-150)
-    static const double exp_table[128+150+1] =
-    {
-        HEX_DBL( +, 1, 82e16284f5ec5, -, 217 ),    HEX_DBL( +, 1, 06e9996332ba1, -, 215 ),
-        HEX_DBL( +, 1, 6555cb289e44b, -, 214 ),    HEX_DBL( +, 1, e5ab364643354, -, 213 ),
-        HEX_DBL( +, 1, 4a0bd18e64df7, -, 211 ),    HEX_DBL( +, 1, c094499cc578e, -, 210 ),
-        HEX_DBL( +, 1, 30d759323998c, -, 208 ),    HEX_DBL( +, 1, 9e5278ab1d4cf, -, 207 ),
-        HEX_DBL( +, 1, 198fa3f30be25, -, 205 ),    HEX_DBL( +, 1, 7eae636d6144e, -, 204 ),
-        HEX_DBL( +, 1, 040f1036f4863, -, 202 ),    HEX_DBL( +, 1, 6174e477a895f, -, 201 ),
-        HEX_DBL( +, 1, e065b82dd95a,  -, 200 ),    HEX_DBL( +, 1, 4676be491d129, -, 198 ),
-        HEX_DBL( +, 1, bbb5da5f7c823, -, 197 ),    HEX_DBL( +, 1, 2d884eef5fdcb, -, 195 ),
-        HEX_DBL( +, 1, 99d3397ab8371, -, 194 ),    HEX_DBL( +, 1, 1681497ed15b3, -, 192 ),
-        HEX_DBL( +, 1, 7a870f597fdbd, -, 191 ),    HEX_DBL( +, 1, 013c74edba307, -, 189 ),
-        HEX_DBL( +, 1, 5d9ec4ada7938, -, 188 ),    HEX_DBL( +, 1, db2edfd20fa7c, -, 187 ),
-        HEX_DBL( +, 1, 42eb9f39afb0b, -, 185 ),    HEX_DBL( +, 1, b6e4f282b43f4, -, 184 ),
-        HEX_DBL( +, 1, 2a42764857b19, -, 182 ),    HEX_DBL( +, 1, 9560792d19314, -, 181 ),
-        HEX_DBL( +, 1, 137b6ce8e052c, -, 179 ),    HEX_DBL( +, 1, 766b45dd84f18, -, 178 ),
-        HEX_DBL( +, 1, fce362fe6e7d,  -, 177 ),    HEX_DBL( +, 1, 59d34dd8a5473, -, 175 ),
-        HEX_DBL( +, 1, d606847fc727a, -, 174 ),    HEX_DBL( +, 1, 3f6a58b795de3, -, 172 ),
-        HEX_DBL( +, 1, b2216c6efdac1, -, 171 ),    HEX_DBL( +, 1, 2705b5b153fb8, -, 169 ),
-        HEX_DBL( +, 1, 90fa1509bd50d, -, 168 ),    HEX_DBL( +, 1, 107df698da211, -, 166 ),
-        HEX_DBL( +, 1, 725ae6e7b9d35, -, 165 ),    HEX_DBL( +, 1, f75d6040aeff6, -, 164 ),
-        HEX_DBL( +, 1, 56126259e093c, -, 162 ),    HEX_DBL( +, 1, d0ec7df4f7bd4, -, 161 ),
-        HEX_DBL( +, 1, 3bf2cf6722e46, -, 159 ),    HEX_DBL( +, 1, ad6b22f55db42, -, 158 ),
-        HEX_DBL( +, 1, 23d1f3e5834a,  -, 156 ),    HEX_DBL( +, 1, 8c9feab89b876, -, 155 ),
-        HEX_DBL( +, 1, 0d88cf37f00dd, -, 153 ),    HEX_DBL( +, 1, 6e55d2bf838a7, -, 152 ),
-        HEX_DBL( +, 1, f1e6b68529e33, -, 151 ),    HEX_DBL( +, 1, 525be4e4e601d, -, 149 ),
-        HEX_DBL( +, 1, cbe0a45f75eb1, -, 148 ),    HEX_DBL( +, 1, 3884e838aea68, -, 146 ),
-        HEX_DBL( +, 1, a8c1f14e2af5d, -, 145 ),    HEX_DBL( +, 1, 20a717e64a9bd, -, 143 ),
-        HEX_DBL( +, 1, 8851d84118908, -, 142 ),    HEX_DBL( +, 1, 0a9bdfb02d24,  -, 140 ),
-        HEX_DBL( +, 1, 6a5bea046b42e, -, 139 ),    HEX_DBL( +, 1, ec7f3b269efa8, -, 138 ),
-        HEX_DBL( +, 1, 4eafb87eab0f2, -, 136 ),    HEX_DBL( +, 1, c6e2d05bbc,    -, 135 ),
-        HEX_DBL( +, 1, 35208867c2683, -, 133 ),    HEX_DBL( +, 1, a425b317eeacd, -, 132 ),
-        HEX_DBL( +, 1, 1d8508fa8246a, -, 130 ),    HEX_DBL( +, 1, 840fbc08fdc8a, -, 129 ),
-        HEX_DBL( +, 1, 07b7112bc1ffe, -, 127 ),    HEX_DBL( +, 1, 666d0dad2961d, -, 126 ),
-        HEX_DBL( +, 1, e726c3f64d0fe, -, 125 ),    HEX_DBL( +, 1, 4b0dc07cabf98, -, 123 ),
-        HEX_DBL( +, 1, c1f2daf3b6a46, -, 122 ),    HEX_DBL( +, 1, 31c5957a47de2, -, 120 ),
-        HEX_DBL( +, 1, 9f96445648b9f, -, 119 ),    HEX_DBL( +, 1, 1a6baeadb4fd1, -, 117 ),
-        HEX_DBL( +, 1, 7fd974d372e45, -, 116 ),    HEX_DBL( +, 1, 04da4d1452919, -, 114 ),
-        HEX_DBL( +, 1, 62891f06b345,  -, 113 ),    HEX_DBL( +, 1, e1dd273aa8a4a, -, 112 ),
-        HEX_DBL( +, 1, 4775e0840bfdd, -, 110 ),    HEX_DBL( +, 1, bd109d9d94bda, -, 109 ),
-        HEX_DBL( +, 1, 2e73f53fba844, -, 107 ),    HEX_DBL( +, 1, 9b138170d6bfe, -, 106 ),
-        HEX_DBL( +, 1, 175af0cf60ec5, -, 104 ),    HEX_DBL( +, 1, 7baee1bffa80b, -, 103 ),
-        HEX_DBL( +, 1, 02057d1245ceb, -, 101 ),    HEX_DBL( +, 1, 5eafffb34ba31, -, 100 ),
-        HEX_DBL( +, 1, dca23bae16424, -, 99 ),    HEX_DBL( +, 1, 43e7fc88b8056, -, 97 ),
-        HEX_DBL( +, 1, b83bf23a9a9eb, -, 96 ),    HEX_DBL( +, 1, 2b2b8dd05b318, -, 94 ),
-        HEX_DBL( +, 1, 969d47321e4cc, -, 93 ),    HEX_DBL( +, 1, 1452b7723aed2, -, 91 ),
-        HEX_DBL( +, 1, 778fe2497184c, -, 90 ),    HEX_DBL( +, 1, fe7116182e9cc, -, 89 ),
-        HEX_DBL( +, 1, 5ae191a99585a, -, 87 ),    HEX_DBL( +, 1, d775d87da854d, -, 86 ),
-        HEX_DBL( +, 1, 4063f8cc8bb98, -, 84 ),    HEX_DBL( +, 1, b374b315f87c1, -, 83 ),
-        HEX_DBL( +, 1, 27ec458c65e3c, -, 81 ),    HEX_DBL( +, 1, 923372c67a074, -, 80 ),
-        HEX_DBL( +, 1, 1152eaeb73c08, -, 78 ),    HEX_DBL( +, 1, 737c5645114b5, -, 77 ),
-        HEX_DBL( +, 1, f8e6c24b5592e, -, 76 ),    HEX_DBL( +, 1, 571db733a9d61, -, 74 ),
-        HEX_DBL( +, 1, d257d547e083f, -, 73 ),    HEX_DBL( +, 1, 3ce9b9de78f85, -, 71 ),
-        HEX_DBL( +, 1, aebabae3a41b5, -, 70 ),    HEX_DBL( +, 1, 24b6031b49bda, -, 68 ),
-        HEX_DBL( +, 1, 8dd5e1bb09d7e, -, 67 ),    HEX_DBL( +, 1, 0e5b73d1ff53d, -, 65 ),
-        HEX_DBL( +, 1, 6f741de1748ec, -, 64 ),    HEX_DBL( +, 1, f36bd37f42f3e, -, 63 ),
-        HEX_DBL( +, 1, 536452ee2f75c, -, 61 ),    HEX_DBL( +, 1, cd480a1b7482,  -, 60 ),
-        HEX_DBL( +, 1, 39792499b1a24, -, 58 ),    HEX_DBL( +, 1, aa0de4bf35b38, -, 57 ),
-        HEX_DBL( +, 1, 2188ad6ae3303, -, 55 ),    HEX_DBL( +, 1, 898471fca6055, -, 54 ),
-        HEX_DBL( +, 1, 0b6c3afdde064, -, 52 ),    HEX_DBL( +, 1, 6b7719a59f0e,  -, 51 ),
-        HEX_DBL( +, 1, ee001eed62aa, -, 50 ),    HEX_DBL( +, 1, 4fb547c775da8, -, 48 ),
-        HEX_DBL( +, 1, c8464f7616468, -, 47 ),    HEX_DBL( +, 1, 36121e24d3bba, -, 45 ),
-        HEX_DBL( +, 1, a56e0c2ac7f75, -, 44 ),    HEX_DBL( +, 1, 1e642baeb84a,  -, 42 ),
-        HEX_DBL( +, 1, 853f01d6d53ba, -, 41 ),    HEX_DBL( +, 1, 0885298767e9a, -, 39 ),
-        HEX_DBL( +, 1, 67852a7007e42, -, 38 ),    HEX_DBL( +, 1, e8a37a45fc32e, -, 37 ),
-        HEX_DBL( +, 1, 4c1078fe9228a, -, 35 ),    HEX_DBL( +, 1, c3527e433fab1, -, 34 ),
-        HEX_DBL( +, 1, 32b48bf117da2, -, 32 ),    HEX_DBL( +, 1, a0db0d0ddb3ec, -, 31 ),
-        HEX_DBL( +, 1, 1b48655f37267, -, 29 ),    HEX_DBL( +, 1, 81056ff2c5772, -, 28 ),
-        HEX_DBL( +, 1, 05a628c699fa1, -, 26 ),    HEX_DBL( +, 1, 639e3175a689d, -, 25 ),
-        HEX_DBL( +, 1, e355bbaee85cb, -, 24 ),    HEX_DBL( +, 1, 4875ca227ec38, -, 22 ),
-        HEX_DBL( +, 1, be6c6fdb01612, -, 21 ),    HEX_DBL( +, 1, 2f6053b981d98, -, 19 ),
-        HEX_DBL( +, 1, 9c54c3b43bc8b, -, 18 ),    HEX_DBL( +, 1, 18354238f6764, -, 16 ),
-        HEX_DBL( +, 1, 7cd79b5647c9b, -, 15 ),    HEX_DBL( +, 1, 02cf22526545a, -, 13 ),
-        HEX_DBL( +, 1, 5fc21041027ad, -, 12 ),    HEX_DBL( +, 1, de16b9c24a98f, -, 11 ),
-        HEX_DBL( +, 1, 44e51f113d4d6, -, 9 ),    HEX_DBL( +, 1, b993fe00d5376, -, 8 ),
-        HEX_DBL( +, 1, 2c155b8213cf4, -, 6 ),    HEX_DBL( +, 1, 97db0ccceb0af, -, 5 ),
-        HEX_DBL( +, 1, 152aaa3bf81cc, -, 3 ),    HEX_DBL( +, 1, 78b56362cef38, -, 2 ),
-        HEX_DBL( +, 1, 0, +, 0 ),                HEX_DBL( +, 1, 5bf0a8b145769, +, 1 ),
-        HEX_DBL( +, 1, d8e64b8d4ddae, +, 2 ),    HEX_DBL( +, 1, 415e5bf6fb106, +, 4 ),
-        HEX_DBL( +, 1, b4c902e273a58, +, 5 ),    HEX_DBL( +, 1, 28d389970338f, +, 7 ),
-        HEX_DBL( +, 1, 936dc5690c08f, +, 8 ),    HEX_DBL( +, 1, 122885aaeddaa, +, 10 ),
-        HEX_DBL( +, 1, 749ea7d470c6e, +, 11 ),    HEX_DBL( +, 1, fa7157c470f82, +, 12 ),
-        HEX_DBL( +, 1, 5829dcf95056,  +, 14 ),    HEX_DBL( +, 1, d3c4488ee4f7f, +, 15 ),
-        HEX_DBL( +, 1, 3de1654d37c9a, +, 17 ),    HEX_DBL( +, 1, b00b5916ac955, +, 18 ),
-        HEX_DBL( +, 1, 259ac48bf05d7, +, 20 ),    HEX_DBL( +, 1, 8f0ccafad2a87, +, 21 ),
-        HEX_DBL( +, 1, 0f2ebd0a8002,  +, 23 ),    HEX_DBL( +, 1, 709348c0ea4f9, +, 24 ),
-        HEX_DBL( +, 1, f4f22091940bd, +, 25 ),    HEX_DBL( +, 1, 546d8f9ed26e1, +, 27 ),
-        HEX_DBL( +, 1, ceb088b68e804, +, 28 ),    HEX_DBL( +, 1, 3a6e1fd9eecfd, +, 30 ),
-        HEX_DBL( +, 1, ab5adb9c436,   +, 31 ),    HEX_DBL( +, 1, 226af33b1fdc1, +, 33 ),
-        HEX_DBL( +, 1, 8ab7fb5475fb7, +, 34 ),    HEX_DBL( +, 1, 0c3d3920962c9, +, 36 ),
-        HEX_DBL( +, 1, 6c932696a6b5d, +, 37 ),    HEX_DBL( +, 1, ef822f7f6731d, +, 38 ),
-        HEX_DBL( +, 1, 50bba3796379a, +, 40 ),    HEX_DBL( +, 1, c9aae4631c056, +, 41 ),
-        HEX_DBL( +, 1, 370470aec28ed, +, 43 ),    HEX_DBL( +, 1, a6b765d8cdf6d, +, 44 ),
-        HEX_DBL( +, 1, 1f43fcc4b662c, +, 46 ),    HEX_DBL( +, 1, 866f34a725782, +, 47 ),
-        HEX_DBL( +, 1, 0953e2f3a1ef7, +, 49 ),    HEX_DBL( +, 1, 689e221bc8d5b, +, 50 ),
-        HEX_DBL( +, 1, ea215a1d20d76, +, 51 ),    HEX_DBL( +, 1, 4d13fbb1a001a, +, 53 ),
-        HEX_DBL( +, 1, c4b334617cc67, +, 54 ),    HEX_DBL( +, 1, 33a43d282a519, +, 56 ),
-        HEX_DBL( +, 1, a220d397972eb, +, 57 ),    HEX_DBL( +, 1, 1c25c88df6862, +, 59 ),
-        HEX_DBL( +, 1, 8232558201159, +, 60 ),    HEX_DBL( +, 1, 0672a3c9eb871, +, 62 ),
-        HEX_DBL( +, 1, 64b41c6d37832, +, 63 ),    HEX_DBL( +, 1, e4cf766fe49be, +, 64 ),
-        HEX_DBL( +, 1, 49767bc0483e3, +, 66 ),    HEX_DBL( +, 1, bfc951eb8bb76, +, 67 ),
-        HEX_DBL( +, 1, 304d6aeca254b, +, 69 ),    HEX_DBL( +, 1, 9d97010884251, +, 70 ),
-        HEX_DBL( +, 1, 19103e4080b45, +, 72 ),    HEX_DBL( +, 1, 7e013cd114461, +, 73 ),
-        HEX_DBL( +, 1, 03996528e074c, +, 75 ),    HEX_DBL( +, 1, 60d4f6fdac731, +, 76 ),
-        HEX_DBL( +, 1, df8c5af17ba3b, +, 77 ),    HEX_DBL( +, 1, 45e3076d61699, +, 79 ),
-        HEX_DBL( +, 1, baed16a6e0da7, +, 80 ),    HEX_DBL( +, 1, 2cffdfebde1a1, +, 82 ),
-        HEX_DBL( +, 1, 9919cabefcb69, +, 83 ),    HEX_DBL( +, 1, 160345c9953e3, +, 85 ),
-        HEX_DBL( +, 1, 79dbc9dc53c66, +, 86 ),    HEX_DBL( +, 1, 00c810d464097, +, 88 ),
-        HEX_DBL( +, 1, 5d009394c5c27, +, 89 ),    HEX_DBL( +, 1, da57de8f107a8, +, 90 ),
-        HEX_DBL( +, 1, 425982cf597cd, +, 92 ),    HEX_DBL( +, 1, b61e5ca3a5e31, +, 93 ),
-        HEX_DBL( +, 1, 29bb825dfcf87, +, 95 ),    HEX_DBL( +, 1, 94a90db0d6fe2, +, 96 ),
-        HEX_DBL( +, 1, 12fec759586fd, +, 98 ),    HEX_DBL( +, 1, 75c1dc469e3af, +, 99 ),
-        HEX_DBL( +, 1, fbfd219c43b04, +, 100 ),    HEX_DBL( +, 1, 5936d44e1a146, +, 102 ),
-        HEX_DBL( +, 1, d531d8a7ee79c, +, 103 ),    HEX_DBL( +, 1, 3ed9d24a2d51b, +, 105 ),
-        HEX_DBL( +, 1, b15cfe5b6e17b, +, 106 ),    HEX_DBL( +, 1, 268038c2c0e,   +, 108 ),
-        HEX_DBL( +, 1, 9044a73545d48, +, 109 ),    HEX_DBL( +, 1, 1002ab6218b38, +, 111 ),
-        HEX_DBL( +, 1, 71b3540cbf921, +, 112 ),    HEX_DBL( +, 1, f6799ea9c414a, +, 113 ),
-        HEX_DBL( +, 1, 55779b984f3eb, +, 115 ),    HEX_DBL( +, 1, d01a210c44aa4, +, 116 ),
-        HEX_DBL( +, 1, 3b63da8e9121,  +, 118 ),    HEX_DBL( +, 1, aca8d6b0116b8, +, 119 ),
-        HEX_DBL( +, 1, 234de9e0c74e9, +, 121 ),    HEX_DBL( +, 1, 8bec7503ca477, +, 122 ),
-        HEX_DBL( +, 1, 0d0eda9796b9,  +, 124 ),    HEX_DBL( +, 1, 6db0118477245, +, 125 ),
-        HEX_DBL( +, 1, f1056dc7bf22d, +, 126 ),    HEX_DBL( +, 1, 51c2cc3433801, +, 128 ),
-        HEX_DBL( +, 1, cb108ffbec164, +, 129 ),    HEX_DBL( +, 1, 37f780991b584, +, 131 ),
-        HEX_DBL( +, 1, a801c0ea8ac4d, +, 132 ),    HEX_DBL( +, 1, 20247cc4c46c1, +, 134 ),
-        HEX_DBL( +, 1, 87a0553328015, +, 135 ),    HEX_DBL( +, 1, 0a233dee4f9bb, +, 137 ),
-        HEX_DBL( +, 1, 69b7f55b808ba, +, 138 ),    HEX_DBL( +, 1, eba064644060a, +, 139 ),
-        HEX_DBL( +, 1, 4e184933d9364, +, 141 ),    HEX_DBL( +, 1, c614fe2531841, +, 142 ),
-        HEX_DBL( +, 1, 3494a9b171bf5, +, 144 ),    HEX_DBL( +, 1, a36798b9d969b, +, 145 ),
-        HEX_DBL( +, 1, 1d03d8c0c04af, +, 147 ),    HEX_DBL( +, 1, 836026385c974, +, 148 ),
-        HEX_DBL( +, 1, 073fbe9ac901d, +, 150 ),    HEX_DBL( +, 1, 65cae0969f286, +, 151 ),
-        HEX_DBL( +, 1, e64a58639cae8, +, 152 ),    HEX_DBL( +, 1, 4a77f5f9b50f9, +, 154 ),
-        HEX_DBL( +, 1, c12744a3a28e3, +, 155 ),    HEX_DBL( +, 1, 313b3b6978e85, +, 157 ),
-        HEX_DBL( +, 1, 9eda3a31e587e, +, 158 ),    HEX_DBL( +, 1, 19ebe56b56453, +, 160 ),
-        HEX_DBL( +, 1, 7f2bc6e599b7e, +, 161 ),    HEX_DBL( +, 1, 04644610df2ff, +, 163 ),
-        HEX_DBL( +, 1, 61e8b490ac4e6, +, 164 ),    HEX_DBL( +, 1, e103201f299b3, +, 165 ),
-        HEX_DBL( +, 1, 46e1b637beaf5, +, 167 ),    HEX_DBL( +, 1, bc473cfede104, +, 168 ),
-        HEX_DBL( +, 1, 2deb1b9c85e2d, +, 170 ),    HEX_DBL( +, 1, 9a5981ca67d1,  +, 171 ),
-        HEX_DBL( +, 1, 16dc8a9ef670b, +, 173 ),    HEX_DBL( +, 1, 7b03166942309, +, 174 ),
-        HEX_DBL( +, 1, 0190be03150a7, +, 176 ),    HEX_DBL( +, 1, 5e1152f9a8119, +, 177 ),
-        HEX_DBL( +, 1, dbca9263f8487, +, 178 ),    HEX_DBL( +, 1, 43556dee93bee, +, 180 ),
-        HEX_DBL( +, 1, b774c12967dfa, +, 181 ),    HEX_DBL( +, 1, 2aa4306e922c2, +, 183 ),
-        HEX_DBL( +, 1, 95e54c5dd4217, +, 184 )    };
-
-    // scale by e**i --  (expm1(f) + 1)*e**i - 1  = expm1(f) * e**i + e**i - 1 = e**i
-    return exp_table[exponent+150] + (f * exp_table[exponent+150] - 1.0);
-}
-
-
-double reference_fmax( double x, double y )
-{
-    if( isnan(y) )
-        return x;
+    static const double exp_table[128 + 150 + 1] = {
+        HEX_DBL(+, 1, 82e16284f5ec5, -, 217),
+        HEX_DBL(+, 1, 06e9996332ba1, -, 215),
+        HEX_DBL(+, 1, 6555cb289e44b, -, 214),
+        HEX_DBL(+, 1, e5ab364643354, -, 213),
+        HEX_DBL(+, 1, 4a0bd18e64df7, -, 211),
+        HEX_DBL(+, 1, c094499cc578e, -, 210),
+        HEX_DBL(+, 1, 30d759323998c, -, 208),
+        HEX_DBL(+, 1, 9e5278ab1d4cf, -, 207),
+        HEX_DBL(+, 1, 198fa3f30be25, -, 205),
+        HEX_DBL(+, 1, 7eae636d6144e, -, 204),
+        HEX_DBL(+, 1, 040f1036f4863, -, 202),
+        HEX_DBL(+, 1, 6174e477a895f, -, 201),
+        HEX_DBL(+, 1, e065b82dd95a, -, 200),
+        HEX_DBL(+, 1, 4676be491d129, -, 198),
+        HEX_DBL(+, 1, bbb5da5f7c823, -, 197),
+        HEX_DBL(+, 1, 2d884eef5fdcb, -, 195),
+        HEX_DBL(+, 1, 99d3397ab8371, -, 194),
+        HEX_DBL(+, 1, 1681497ed15b3, -, 192),
+        HEX_DBL(+, 1, 7a870f597fdbd, -, 191),
+        HEX_DBL(+, 1, 013c74edba307, -, 189),
+        HEX_DBL(+, 1, 5d9ec4ada7938, -, 188),
+        HEX_DBL(+, 1, db2edfd20fa7c, -, 187),
+        HEX_DBL(+, 1, 42eb9f39afb0b, -, 185),
+        HEX_DBL(+, 1, b6e4f282b43f4, -, 184),
+        HEX_DBL(+, 1, 2a42764857b19, -, 182),
+        HEX_DBL(+, 1, 9560792d19314, -, 181),
+        HEX_DBL(+, 1, 137b6ce8e052c, -, 179),
+        HEX_DBL(+, 1, 766b45dd84f18, -, 178),
+        HEX_DBL(+, 1, fce362fe6e7d, -, 177),
+        HEX_DBL(+, 1, 59d34dd8a5473, -, 175),
+        HEX_DBL(+, 1, d606847fc727a, -, 174),
+        HEX_DBL(+, 1, 3f6a58b795de3, -, 172),
+        HEX_DBL(+, 1, b2216c6efdac1, -, 171),
+        HEX_DBL(+, 1, 2705b5b153fb8, -, 169),
+        HEX_DBL(+, 1, 90fa1509bd50d, -, 168),
+        HEX_DBL(+, 1, 107df698da211, -, 166),
+        HEX_DBL(+, 1, 725ae6e7b9d35, -, 165),
+        HEX_DBL(+, 1, f75d6040aeff6, -, 164),
+        HEX_DBL(+, 1, 56126259e093c, -, 162),
+        HEX_DBL(+, 1, d0ec7df4f7bd4, -, 161),
+        HEX_DBL(+, 1, 3bf2cf6722e46, -, 159),
+        HEX_DBL(+, 1, ad6b22f55db42, -, 158),
+        HEX_DBL(+, 1, 23d1f3e5834a, -, 156),
+        HEX_DBL(+, 1, 8c9feab89b876, -, 155),
+        HEX_DBL(+, 1, 0d88cf37f00dd, -, 153),
+        HEX_DBL(+, 1, 6e55d2bf838a7, -, 152),
+        HEX_DBL(+, 1, f1e6b68529e33, -, 151),
+        HEX_DBL(+, 1, 525be4e4e601d, -, 149),
+        HEX_DBL(+, 1, cbe0a45f75eb1, -, 148),
+        HEX_DBL(+, 1, 3884e838aea68, -, 146),
+        HEX_DBL(+, 1, a8c1f14e2af5d, -, 145),
+        HEX_DBL(+, 1, 20a717e64a9bd, -, 143),
+        HEX_DBL(+, 1, 8851d84118908, -, 142),
+        HEX_DBL(+, 1, 0a9bdfb02d24, -, 140),
+        HEX_DBL(+, 1, 6a5bea046b42e, -, 139),
+        HEX_DBL(+, 1, ec7f3b269efa8, -, 138),
+        HEX_DBL(+, 1, 4eafb87eab0f2, -, 136),
+        HEX_DBL(+, 1, c6e2d05bbc, -, 135),
+        HEX_DBL(+, 1, 35208867c2683, -, 133),
+        HEX_DBL(+, 1, a425b317eeacd, -, 132),
+        HEX_DBL(+, 1, 1d8508fa8246a, -, 130),
+        HEX_DBL(+, 1, 840fbc08fdc8a, -, 129),
+        HEX_DBL(+, 1, 07b7112bc1ffe, -, 127),
+        HEX_DBL(+, 1, 666d0dad2961d, -, 126),
+        HEX_DBL(+, 1, e726c3f64d0fe, -, 125),
+        HEX_DBL(+, 1, 4b0dc07cabf98, -, 123),
+        HEX_DBL(+, 1, c1f2daf3b6a46, -, 122),
+        HEX_DBL(+, 1, 31c5957a47de2, -, 120),
+        HEX_DBL(+, 1, 9f96445648b9f, -, 119),
+        HEX_DBL(+, 1, 1a6baeadb4fd1, -, 117),
+        HEX_DBL(+, 1, 7fd974d372e45, -, 116),
+        HEX_DBL(+, 1, 04da4d1452919, -, 114),
+        HEX_DBL(+, 1, 62891f06b345, -, 113),
+        HEX_DBL(+, 1, e1dd273aa8a4a, -, 112),
+        HEX_DBL(+, 1, 4775e0840bfdd, -, 110),
+        HEX_DBL(+, 1, bd109d9d94bda, -, 109),
+        HEX_DBL(+, 1, 2e73f53fba844, -, 107),
+        HEX_DBL(+, 1, 9b138170d6bfe, -, 106),
+        HEX_DBL(+, 1, 175af0cf60ec5, -, 104),
+        HEX_DBL(+, 1, 7baee1bffa80b, -, 103),
+        HEX_DBL(+, 1, 02057d1245ceb, -, 101),
+        HEX_DBL(+, 1, 5eafffb34ba31, -, 100),
+        HEX_DBL(+, 1, dca23bae16424, -, 99),
+        HEX_DBL(+, 1, 43e7fc88b8056, -, 97),
+        HEX_DBL(+, 1, b83bf23a9a9eb, -, 96),
+        HEX_DBL(+, 1, 2b2b8dd05b318, -, 94),
+        HEX_DBL(+, 1, 969d47321e4cc, -, 93),
+        HEX_DBL(+, 1, 1452b7723aed2, -, 91),
+        HEX_DBL(+, 1, 778fe2497184c, -, 90),
+        HEX_DBL(+, 1, fe7116182e9cc, -, 89),
+        HEX_DBL(+, 1, 5ae191a99585a, -, 87),
+        HEX_DBL(+, 1, d775d87da854d, -, 86),
+        HEX_DBL(+, 1, 4063f8cc8bb98, -, 84),
+        HEX_DBL(+, 1, b374b315f87c1, -, 83),
+        HEX_DBL(+, 1, 27ec458c65e3c, -, 81),
+        HEX_DBL(+, 1, 923372c67a074, -, 80),
+        HEX_DBL(+, 1, 1152eaeb73c08, -, 78),
+        HEX_DBL(+, 1, 737c5645114b5, -, 77),
+        HEX_DBL(+, 1, f8e6c24b5592e, -, 76),
+        HEX_DBL(+, 1, 571db733a9d61, -, 74),
+        HEX_DBL(+, 1, d257d547e083f, -, 73),
+        HEX_DBL(+, 1, 3ce9b9de78f85, -, 71),
+        HEX_DBL(+, 1, aebabae3a41b5, -, 70),
+        HEX_DBL(+, 1, 24b6031b49bda, -, 68),
+        HEX_DBL(+, 1, 8dd5e1bb09d7e, -, 67),
+        HEX_DBL(+, 1, 0e5b73d1ff53d, -, 65),
+        HEX_DBL(+, 1, 6f741de1748ec, -, 64),
+        HEX_DBL(+, 1, f36bd37f42f3e, -, 63),
+        HEX_DBL(+, 1, 536452ee2f75c, -, 61),
+        HEX_DBL(+, 1, cd480a1b7482, -, 60),
+        HEX_DBL(+, 1, 39792499b1a24, -, 58),
+        HEX_DBL(+, 1, aa0de4bf35b38, -, 57),
+        HEX_DBL(+, 1, 2188ad6ae3303, -, 55),
+        HEX_DBL(+, 1, 898471fca6055, -, 54),
+        HEX_DBL(+, 1, 0b6c3afdde064, -, 52),
+        HEX_DBL(+, 1, 6b7719a59f0e, -, 51),
+        HEX_DBL(+, 1, ee001eed62aa, -, 50),
+        HEX_DBL(+, 1, 4fb547c775da8, -, 48),
+        HEX_DBL(+, 1, c8464f7616468, -, 47),
+        HEX_DBL(+, 1, 36121e24d3bba, -, 45),
+        HEX_DBL(+, 1, a56e0c2ac7f75, -, 44),
+        HEX_DBL(+, 1, 1e642baeb84a, -, 42),
+        HEX_DBL(+, 1, 853f01d6d53ba, -, 41),
+        HEX_DBL(+, 1, 0885298767e9a, -, 39),
+        HEX_DBL(+, 1, 67852a7007e42, -, 38),
+        HEX_DBL(+, 1, e8a37a45fc32e, -, 37),
+        HEX_DBL(+, 1, 4c1078fe9228a, -, 35),
+        HEX_DBL(+, 1, c3527e433fab1, -, 34),
+        HEX_DBL(+, 1, 32b48bf117da2, -, 32),
+        HEX_DBL(+, 1, a0db0d0ddb3ec, -, 31),
+        HEX_DBL(+, 1, 1b48655f37267, -, 29),
+        HEX_DBL(+, 1, 81056ff2c5772, -, 28),
+        HEX_DBL(+, 1, 05a628c699fa1, -, 26),
+        HEX_DBL(+, 1, 639e3175a689d, -, 25),
+        HEX_DBL(+, 1, e355bbaee85cb, -, 24),
+        HEX_DBL(+, 1, 4875ca227ec38, -, 22),
+        HEX_DBL(+, 1, be6c6fdb01612, -, 21),
+        HEX_DBL(+, 1, 2f6053b981d98, -, 19),
+        HEX_DBL(+, 1, 9c54c3b43bc8b, -, 18),
+        HEX_DBL(+, 1, 18354238f6764, -, 16),
+        HEX_DBL(+, 1, 7cd79b5647c9b, -, 15),
+        HEX_DBL(+, 1, 02cf22526545a, -, 13),
+        HEX_DBL(+, 1, 5fc21041027ad, -, 12),
+        HEX_DBL(+, 1, de16b9c24a98f, -, 11),
+        HEX_DBL(+, 1, 44e51f113d4d6, -, 9),
+        HEX_DBL(+, 1, b993fe00d5376, -, 8),
+        HEX_DBL(+, 1, 2c155b8213cf4, -, 6),
+        HEX_DBL(+, 1, 97db0ccceb0af, -, 5),
+        HEX_DBL(+, 1, 152aaa3bf81cc, -, 3),
+        HEX_DBL(+, 1, 78b56362cef38, -, 2),
+        HEX_DBL(+, 1, 0, +, 0),
+        HEX_DBL(+, 1, 5bf0a8b145769, +, 1),
+        HEX_DBL(+, 1, d8e64b8d4ddae, +, 2),
+        HEX_DBL(+, 1, 415e5bf6fb106, +, 4),
+        HEX_DBL(+, 1, b4c902e273a58, +, 5),
+        HEX_DBL(+, 1, 28d389970338f, +, 7),
+        HEX_DBL(+, 1, 936dc5690c08f, +, 8),
+        HEX_DBL(+, 1, 122885aaeddaa, +, 10),
+        HEX_DBL(+, 1, 749ea7d470c6e, +, 11),
+        HEX_DBL(+, 1, fa7157c470f82, +, 12),
+        HEX_DBL(+, 1, 5829dcf95056, +, 14),
+        HEX_DBL(+, 1, d3c4488ee4f7f, +, 15),
+        HEX_DBL(+, 1, 3de1654d37c9a, +, 17),
+        HEX_DBL(+, 1, b00b5916ac955, +, 18),
+        HEX_DBL(+, 1, 259ac48bf05d7, +, 20),
+        HEX_DBL(+, 1, 8f0ccafad2a87, +, 21),
+        HEX_DBL(+, 1, 0f2ebd0a8002, +, 23),
+        HEX_DBL(+, 1, 709348c0ea4f9, +, 24),
+        HEX_DBL(+, 1, f4f22091940bd, +, 25),
+        HEX_DBL(+, 1, 546d8f9ed26e1, +, 27),
+        HEX_DBL(+, 1, ceb088b68e804, +, 28),
+        HEX_DBL(+, 1, 3a6e1fd9eecfd, +, 30),
+        HEX_DBL(+, 1, ab5adb9c436, +, 31),
+        HEX_DBL(+, 1, 226af33b1fdc1, +, 33),
+        HEX_DBL(+, 1, 8ab7fb5475fb7, +, 34),
+        HEX_DBL(+, 1, 0c3d3920962c9, +, 36),
+        HEX_DBL(+, 1, 6c932696a6b5d, +, 37),
+        HEX_DBL(+, 1, ef822f7f6731d, +, 38),
+        HEX_DBL(+, 1, 50bba3796379a, +, 40),
+        HEX_DBL(+, 1, c9aae4631c056, +, 41),
+        HEX_DBL(+, 1, 370470aec28ed, +, 43),
+        HEX_DBL(+, 1, a6b765d8cdf6d, +, 44),
+        HEX_DBL(+, 1, 1f43fcc4b662c, +, 46),
+        HEX_DBL(+, 1, 866f34a725782, +, 47),
+        HEX_DBL(+, 1, 0953e2f3a1ef7, +, 49),
+        HEX_DBL(+, 1, 689e221bc8d5b, +, 50),
+        HEX_DBL(+, 1, ea215a1d20d76, +, 51),
+        HEX_DBL(+, 1, 4d13fbb1a001a, +, 53),
+        HEX_DBL(+, 1, c4b334617cc67, +, 54),
+        HEX_DBL(+, 1, 33a43d282a519, +, 56),
+        HEX_DBL(+, 1, a220d397972eb, +, 57),
+        HEX_DBL(+, 1, 1c25c88df6862, +, 59),
+        HEX_DBL(+, 1, 8232558201159, +, 60),
+        HEX_DBL(+, 1, 0672a3c9eb871, +, 62),
+        HEX_DBL(+, 1, 64b41c6d37832, +, 63),
+        HEX_DBL(+, 1, e4cf766fe49be, +, 64),
+        HEX_DBL(+, 1, 49767bc0483e3, +, 66),
+        HEX_DBL(+, 1, bfc951eb8bb76, +, 67),
+        HEX_DBL(+, 1, 304d6aeca254b, +, 69),
+        HEX_DBL(+, 1, 9d97010884251, +, 70),
+        HEX_DBL(+, 1, 19103e4080b45, +, 72),
+        HEX_DBL(+, 1, 7e013cd114461, +, 73),
+        HEX_DBL(+, 1, 03996528e074c, +, 75),
+        HEX_DBL(+, 1, 60d4f6fdac731, +, 76),
+        HEX_DBL(+, 1, df8c5af17ba3b, +, 77),
+        HEX_DBL(+, 1, 45e3076d61699, +, 79),
+        HEX_DBL(+, 1, baed16a6e0da7, +, 80),
+        HEX_DBL(+, 1, 2cffdfebde1a1, +, 82),
+        HEX_DBL(+, 1, 9919cabefcb69, +, 83),
+        HEX_DBL(+, 1, 160345c9953e3, +, 85),
+        HEX_DBL(+, 1, 79dbc9dc53c66, +, 86),
+        HEX_DBL(+, 1, 00c810d464097, +, 88),
+        HEX_DBL(+, 1, 5d009394c5c27, +, 89),
+        HEX_DBL(+, 1, da57de8f107a8, +, 90),
+        HEX_DBL(+, 1, 425982cf597cd, +, 92),
+        HEX_DBL(+, 1, b61e5ca3a5e31, +, 93),
+        HEX_DBL(+, 1, 29bb825dfcf87, +, 95),
+        HEX_DBL(+, 1, 94a90db0d6fe2, +, 96),
+        HEX_DBL(+, 1, 12fec759586fd, +, 98),
+        HEX_DBL(+, 1, 75c1dc469e3af, +, 99),
+        HEX_DBL(+, 1, fbfd219c43b04, +, 100),
+        HEX_DBL(+, 1, 5936d44e1a146, +, 102),
+        HEX_DBL(+, 1, d531d8a7ee79c, +, 103),
+        HEX_DBL(+, 1, 3ed9d24a2d51b, +, 105),
+        HEX_DBL(+, 1, b15cfe5b6e17b, +, 106),
+        HEX_DBL(+, 1, 268038c2c0e, +, 108),
+        HEX_DBL(+, 1, 9044a73545d48, +, 109),
+        HEX_DBL(+, 1, 1002ab6218b38, +, 111),
+        HEX_DBL(+, 1, 71b3540cbf921, +, 112),
+        HEX_DBL(+, 1, f6799ea9c414a, +, 113),
+        HEX_DBL(+, 1, 55779b984f3eb, +, 115),
+        HEX_DBL(+, 1, d01a210c44aa4, +, 116),
+        HEX_DBL(+, 1, 3b63da8e9121, +, 118),
+        HEX_DBL(+, 1, aca8d6b0116b8, +, 119),
+        HEX_DBL(+, 1, 234de9e0c74e9, +, 121),
+        HEX_DBL(+, 1, 8bec7503ca477, +, 122),
+        HEX_DBL(+, 1, 0d0eda9796b9, +, 124),
+        HEX_DBL(+, 1, 6db0118477245, +, 125),
+        HEX_DBL(+, 1, f1056dc7bf22d, +, 126),
+        HEX_DBL(+, 1, 51c2cc3433801, +, 128),
+        HEX_DBL(+, 1, cb108ffbec164, +, 129),
+        HEX_DBL(+, 1, 37f780991b584, +, 131),
+        HEX_DBL(+, 1, a801c0ea8ac4d, +, 132),
+        HEX_DBL(+, 1, 20247cc4c46c1, +, 134),
+        HEX_DBL(+, 1, 87a0553328015, +, 135),
+        HEX_DBL(+, 1, 0a233dee4f9bb, +, 137),
+        HEX_DBL(+, 1, 69b7f55b808ba, +, 138),
+        HEX_DBL(+, 1, eba064644060a, +, 139),
+        HEX_DBL(+, 1, 4e184933d9364, +, 141),
+        HEX_DBL(+, 1, c614fe2531841, +, 142),
+        HEX_DBL(+, 1, 3494a9b171bf5, +, 144),
+        HEX_DBL(+, 1, a36798b9d969b, +, 145),
+        HEX_DBL(+, 1, 1d03d8c0c04af, +, 147),
+        HEX_DBL(+, 1, 836026385c974, +, 148),
+        HEX_DBL(+, 1, 073fbe9ac901d, +, 150),
+        HEX_DBL(+, 1, 65cae0969f286, +, 151),
+        HEX_DBL(+, 1, e64a58639cae8, +, 152),
+        HEX_DBL(+, 1, 4a77f5f9b50f9, +, 154),
+        HEX_DBL(+, 1, c12744a3a28e3, +, 155),
+        HEX_DBL(+, 1, 313b3b6978e85, +, 157),
+        HEX_DBL(+, 1, 9eda3a31e587e, +, 158),
+        HEX_DBL(+, 1, 19ebe56b56453, +, 160),
+        HEX_DBL(+, 1, 7f2bc6e599b7e, +, 161),
+        HEX_DBL(+, 1, 04644610df2ff, +, 163),
+        HEX_DBL(+, 1, 61e8b490ac4e6, +, 164),
+        HEX_DBL(+, 1, e103201f299b3, +, 165),
+        HEX_DBL(+, 1, 46e1b637beaf5, +, 167),
+        HEX_DBL(+, 1, bc473cfede104, +, 168),
+        HEX_DBL(+, 1, 2deb1b9c85e2d, +, 170),
+        HEX_DBL(+, 1, 9a5981ca67d1, +, 171),
+        HEX_DBL(+, 1, 16dc8a9ef670b, +, 173),
+        HEX_DBL(+, 1, 7b03166942309, +, 174),
+        HEX_DBL(+, 1, 0190be03150a7, +, 176),
+        HEX_DBL(+, 1, 5e1152f9a8119, +, 177),
+        HEX_DBL(+, 1, dbca9263f8487, +, 178),
+        HEX_DBL(+, 1, 43556dee93bee, +, 180),
+        HEX_DBL(+, 1, b774c12967dfa, +, 181),
+        HEX_DBL(+, 1, 2aa4306e922c2, +, 183),
+        HEX_DBL(+, 1, 95e54c5dd4217, +, 184)
+    };
+
+    // scale by e**i --  (expm1(f) + 1)*e**i - 1  = expm1(f) * e**i + e**i - 1 =
+    // e**i + (expm1(f) * e**i - 1)
+    return exp_table[exponent + 150] + (f * exp_table[exponent + 150] - 1.0);
+}
+
+
+double reference_fmax(double x, double y)
+{
+    if (isnan(y)) return x;
 
     return x >= y ? x : y;
 }
 
-double reference_fmin( double x, double y )
+double reference_fmin(double x, double y)
 {
-    if( isnan(y) )
-        return x;
+    if (isnan(y)) return x;
 
     return x <= y ? x : y;
 }
 
-double reference_hypot( double x, double y )
+double reference_hypot(double x, double y)
 {
-    // Since the inputs are actually floats, we don't have to worry about range here
-    if( isinf(x) || isinf(y) )
-        return INFINITY;
+    // Since the inputs are actually floats, we don't have to worry about range
+    // here
+    if (isinf(x) || isinf(y)) return INFINITY;
 
-    return sqrt( x * x + y * y );
+    return sqrt(x * x + y * y);
 }
 
-int    reference_ilogbl( long double x)
+int reference_ilogbl(long double x)
 {
     extern int gDeviceILogb0, gDeviceILogbNaN;
 
     // Since we are just using this to verify double precision, we can
     // use the double precision ilogb here
-    union { double f; cl_ulong u;} u;
-    u.f = (double) x;
+    union {
+        double f;
+        cl_ulong u;
+    } u;
+    u.f = (double)x;
 
     int exponent = (int)(u.u >> 52) & 0x7ff;
-    if( exponent == 0x7ff )
+    if (exponent == 0x7ff)
     {
-        if( u.u & 0x000fffffffffffffULL )
-            return gDeviceILogbNaN;
+        if (u.u & 0x000fffffffffffffULL) return gDeviceILogbNaN;
 
         return CL_INT_MAX;
     }
 
-    if( exponent == 0 )
-    {   // deal with denormals
-        u.f =  x * HEX_DBL( +, 1, 0, +, 64 );
+    if (exponent == 0)
+    { // deal with denormals
+        u.f = x * HEX_DBL(+, 1, 0, +, 64);
         exponent = (cl_uint)(u.u >> 52) & 0x7ff;
-        if( exponent == 0 )
-            return gDeviceILogb0;
+        if (exponent == 0) return gDeviceILogb0;
 
         exponent -= 1023 + 64;
         return exponent;
@@ -1522,84 +1763,111 @@ int    reference_ilogbl( long double x)
     return exponent - 1023;
 }
 
-//double reference_log2( double x )
+// double reference_log2( double x )
 //{
 //    return log( x ) * 1.44269504088896340735992468100189214;
 //}
 
 
-double reference_relaxed_log2( double x )
-{
-  return reference_log2(x);
-}
+double reference_relaxed_log2(double x) { return reference_log2(x); }
 
-double reference_log2( double x )
+double reference_log2(double x)
 {
-    if( isnan(x) || x < 0.0 || x == -INFINITY)
-        return cl_make_nan();
+    if (isnan(x) || x < 0.0 || x == -INFINITY) return cl_make_nan();
 
-    if( x == 0.0f)
-        return -INFINITY;
+    if (x == 0.0f) return -INFINITY;
 
-    if( x == INFINITY )
-        return INFINITY;
+    if (x == INFINITY) return INFINITY;
 
     double hi, lo;
-    __log2_ep( &hi, &lo, x );
+    __log2_ep(&hi, &lo, x);
     return hi;
 }
 
-double reference_log1p( double x )
-{   // This function is suitable only for verifying log1pf(). It produces several double precision ulps of error.
+double reference_log1p(double x)
+{ // This function is suitable only for verifying log1pf(). It produces several
+  // double precision ulps of error.
 
     // Handle small and NaN
-    if( ! ( reference_fabs(x) > HEX_DBL( +, 1, 0, -, 53 ) ) )
-        return x;
+    if (!(reference_fabs(x) > HEX_DBL(+, 1, 0, -, 53))) return x;
 
     // deal with special values
-    if( x <= -1.0 )
+    if (x <= -1.0)
     {
-        if( x < -1.0 )
-            return cl_make_nan();
+        if (x < -1.0) return cl_make_nan();
         return -INFINITY;
     }
 
     // infinity
-    if( x == INFINITY )
-        return INFINITY;
+    if (x == INFINITY) return INFINITY;
 
-    // High precision result for when near 0, to avoid problems with the reference result falling in the wrong binade.
-    if( reference_fabs(x) < HEX_DBL( +, 1, 0, -, 28 ) )
-        return (1.0 - 0.5 * x) * x;
+    // High precision result for when near 0, to avoid problems with the
+    // reference result falling in the wrong binade.
+    if (reference_fabs(x) < HEX_DBL(+, 1, 0, -, 28)) return (1.0 - 0.5 * x) * x;
 
     // Our polynomial is only good in the region +-2**-4.
     // If we aren't in that range then we need to reduce to be in that range
-    double correctionLo = -0.0;           // correction down stream to compensate for the reduction, if any
-    double correctionHi = -0.0;           // correction down stream to compensate for the exponent, if any
-    if( reference_fabs(x) > HEX_DBL( +, 1, 0, -, 4 ) )
+    double correctionLo =
+        -0.0; // correction down stream to compensate for the reduction, if any
+    double correctionHi =
+        -0.0; // correction down stream to compensate for the exponent, if any
+    if (reference_fabs(x) > HEX_DBL(+, 1, 0, -, 4))
     {
-        x += 1.0;   // double should cover any loss of precision here
+        x += 1.0; // double should cover any loss of precision here
 
         // separate x into (1+f) * 2**i
-        union{ double d; cl_ulong u;} u;        u.d = x;
-        int i = (int) ((u.u >> 52) & 0x7ff) - 1023;
+        union {
+            double d;
+            cl_ulong u;
+        } u;
+        u.d = x;
+        int i = (int)((u.u >> 52) & 0x7ff) - 1023;
         u.u &= 0x000fffffffffffffULL;
-        int index = (int) (u.u >> 48 );
+        int index = (int)(u.u >> 48);
         u.u |= 0x3ff0000000000000ULL;
         double f = u.d;
 
         // further reduce f to be within 1/16 of 1.0
-        static const double scale_table[16] = {                  1.0, HEX_DBL( +, 1, d2d2d2d6e3f79, -, 1 ), HEX_DBL( +, 1, b8e38e42737a1, -, 1 ), HEX_DBL( +, 1, a1af28711adf3, -, 1 ),
-                                                HEX_DBL( +, 1, 8cccccd88dd65, -, 1 ), HEX_DBL( +, 1, 79e79e810ec8f, -, 1 ), HEX_DBL( +, 1, 68ba2e94df404, -, 1 ), HEX_DBL( +, 1, 590b216defb29, -, 1 ),
-                                                HEX_DBL( +, 1, 4aaaaab1500ed, -, 1 ), HEX_DBL( +, 1, 3d70a3e0d6f73, -, 1 ), HEX_DBL( +, 1, 313b13bb39f4f, -, 1 ), HEX_DBL( +, 1, 25ed09823f1cc, -, 1 ),
-                                                HEX_DBL( +, 1, 1b6db6e77457b, -, 1 ), HEX_DBL( +, 1, 11a7b96a3a34f, -, 1 ), HEX_DBL( +, 1, 0888888e46fea, -, 1 ), HEX_DBL( +, 1, 00000038e9862, -, 1 ) };
+        static const double scale_table[16] = {
+            1.0,
+            HEX_DBL(+, 1, d2d2d2d6e3f79, -, 1),
+            HEX_DBL(+, 1, b8e38e42737a1, -, 1),
+            HEX_DBL(+, 1, a1af28711adf3, -, 1),
+            HEX_DBL(+, 1, 8cccccd88dd65, -, 1),
+            HEX_DBL(+, 1, 79e79e810ec8f, -, 1),
+            HEX_DBL(+, 1, 68ba2e94df404, -, 1),
+            HEX_DBL(+, 1, 590b216defb29, -, 1),
+            HEX_DBL(+, 1, 4aaaaab1500ed, -, 1),
+            HEX_DBL(+, 1, 3d70a3e0d6f73, -, 1),
+            HEX_DBL(+, 1, 313b13bb39f4f, -, 1),
+            HEX_DBL(+, 1, 25ed09823f1cc, -, 1),
+            HEX_DBL(+, 1, 1b6db6e77457b, -, 1),
+            HEX_DBL(+, 1, 11a7b96a3a34f, -, 1),
+            HEX_DBL(+, 1, 0888888e46fea, -, 1),
+            HEX_DBL(+, 1, 00000038e9862, -, 1)
+        };
 
         // correction_table[i] = -log( scale_table[i] )
-        // All entries have >= 64 bits of precision (rather than the expected 53)
-        static const double correction_table[16] = {                   -0.0, HEX_DBL( +, 1, 7a5c722c16058, -, 4 ), HEX_DBL( +, 1, 323db16c89ab1, -, 3 ), HEX_DBL( +, 1, a0f87d180629, -, 3 ),
-                                                       HEX_DBL( +, 1, 050279324e17c, -, 2 ), HEX_DBL( +, 1, 36f885bb270b0, -, 2 ), HEX_DBL( +, 1, 669b771b5cc69, -, 2 ), HEX_DBL( +, 1, 94203a6292a05, -, 2 ),
-                                                       HEX_DBL( +, 1, bfb4f9cb333a4, -, 2 ), HEX_DBL( +, 1, e982376ddb80e, -, 2 ), HEX_DBL( +, 1, 08d5d8769b2b2, -, 1 ), HEX_DBL( +, 1, 1c288bc00e0cf, -, 1 ),
-                                                       HEX_DBL( +, 1, 2ec7535b31ecb, -, 1 ), HEX_DBL( +, 1, 40bed0adc63fb, -, 1 ), HEX_DBL( +, 1, 521a5c0330615, -, 1 ), HEX_DBL( +, 1, 62e42f7dd092c, -, 1 ) };
+        // All entries have >= 64 bits of precision (rather than the expected
+        // 53)
+        static const double correction_table[16] = {
+            -0.0,
+            HEX_DBL(+, 1, 7a5c722c16058, -, 4),
+            HEX_DBL(+, 1, 323db16c89ab1, -, 3),
+            HEX_DBL(+, 1, a0f87d180629, -, 3),
+            HEX_DBL(+, 1, 050279324e17c, -, 2),
+            HEX_DBL(+, 1, 36f885bb270b0, -, 2),
+            HEX_DBL(+, 1, 669b771b5cc69, -, 2),
+            HEX_DBL(+, 1, 94203a6292a05, -, 2),
+            HEX_DBL(+, 1, bfb4f9cb333a4, -, 2),
+            HEX_DBL(+, 1, e982376ddb80e, -, 2),
+            HEX_DBL(+, 1, 08d5d8769b2b2, -, 1),
+            HEX_DBL(+, 1, 1c288bc00e0cf, -, 1),
+            HEX_DBL(+, 1, 2ec7535b31ecb, -, 1),
+            HEX_DBL(+, 1, 40bed0adc63fb, -, 1),
+            HEX_DBL(+, 1, 521a5c0330615, -, 1),
+            HEX_DBL(+, 1, 62e42f7dd092c, -, 1)
+        };
 
         f *= scale_table[index];
         correctionLo = correction_table[index];
@@ -1611,17 +1879,25 @@ double reference_log1p( double x )
     }
 
 
-    // minmax polynomial for p(x) = (log(x+1) - x)/x valid over the range x = [-1/16, 1/16]
+    // minimax polynomial for p(x) = (log(x+1) - x)/x valid over the range x =
+    // [-1/16, 1/16]
     //          max error HEX_DBL( +, 1, 048f61f9a5eca, -, 52 )
-    double p = HEX_DBL( -, 1, cc33de97a9d7b,  -, 46 ) +
-               (HEX_DBL( -, 1, fffffffff3eb7, -, 2 ) +
-               (HEX_DBL( +, 1, 5555555633ef7, -, 2 ) +
-               (HEX_DBL( -, 1, 00000062c78,   -, 2 ) +
-               (HEX_DBL( +, 1, 9999958a3321,  -, 3 ) +
-               (HEX_DBL( -, 1, 55534ce65c347, -, 3 ) +
-               (HEX_DBL( +, 1, 24957208391a5, -, 3 ) +
-               (HEX_DBL( -, 1, 02287b9a5b4a1, -, 3 ) +
-                HEX_DBL( +, 1, c757d922180ed, -, 4 ) * x)*x)*x)*x)*x)*x)*x)*x;
+    double p = HEX_DBL(-, 1, cc33de97a9d7b, -, 46)
+        + (HEX_DBL(-, 1, fffffffff3eb7, -, 2)
+           + (HEX_DBL(+, 1, 5555555633ef7, -, 2)
+              + (HEX_DBL(-, 1, 00000062c78, -, 2)
+                 + (HEX_DBL(+, 1, 9999958a3321, -, 3)
+                    + (HEX_DBL(-, 1, 55534ce65c347, -, 3)
+                       + (HEX_DBL(+, 1, 24957208391a5, -, 3)
+                          + (HEX_DBL(-, 1, 02287b9a5b4a1, -, 3)
+                             + HEX_DBL(+, 1, c757d922180ed, -, 4) * x)
+                              * x)
+                           * x)
+                        * x)
+                     * x)
+                  * x)
+               * x)
+            * x;
 
     // log(x+1) = x * p(x) + x
     x += x * p;
@@ -1629,22 +1905,23 @@ double reference_log1p( double x )
     return correctionHi + (correctionLo + x);
 }
 
-double reference_logb( double x )
+double reference_logb(double x)
 {
-    union { float f; cl_uint u;} u;
-    u.f = (float) x;
+    union {
+        float f;
+        cl_uint u;
+    } u;
+    u.f = (float)x;
 
     cl_int exponent = (u.u >> 23) & 0xff;
-    if( exponent == 0xff )
-        return x * x;
+    if (exponent == 0xff) return x * x;
 
-    if( exponent == 0 )
-    {   // deal with denormals
+    if (exponent == 0)
+    { // deal with denormals
         u.u = (u.u & 0x007fffff) | 0x3f800000;
         u.f -= 1.0f;
         exponent = (u.u >> 23) & 0xff;
-        if( exponent == 0 )
-            return -INFINITY;
+        if (exponent == 0) return -INFINITY;
 
         return exponent - (127 + 126);
     }
@@ -1652,219 +1929,271 @@ double reference_logb( double x )
     return exponent - 127;
 }
 
-double reference_relaxed_reciprocal(double x)
-{
-  return 1.0f / ((float) x);
-}
+double reference_relaxed_reciprocal(double x) { return 1.0f / ((float)x); }
 
-double reference_reciprocal( double x )
-{
-  return 1.0 / x;
-}
+double reference_reciprocal(double x) { return 1.0 / x; }
 
-double reference_remainder( double x, double y )
+double reference_remainder(double x, double y)
 {
     int i;
-    return reference_remquo( x, y, &i );
-}
+    return reference_remquo(x, y, &i);
+}
+
+double reference_lgamma(double x)
+{
+    /*
+     * ====================================================
+     * This function is from fdlibm. http://www.netlib.org
+     * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+     *
+     * Developed at SunSoft, a Sun Microsystems, Inc. business.
+     * Permission to use, copy, modify, and distribute this
+     * software is freely granted, provided that this notice
+     * is preserved.
+     * ====================================================
+     *
+     */
+
+    static const double // two52 = 4.50359962737049600000e+15, /* 0x43300000,
+                        // 0x00000000 */
+        half = 5.00000000000000000000e-01, /* 0x3FE00000,
+                                              0x00000000 */
+        one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
+        pi = 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */
+        a0 = 7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */
+        a1 = 3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */
+        a2 = 6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */
+        a3 = 2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */
+        a4 = 7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */
+        a5 = 2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */
+        a6 = 1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */
+        a7 = 5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */
+        a8 = 2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */
+        a9 = 1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */
+        a10 = 2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */
+        a11 = 4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */
+        tc = 1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */
+        tf = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */
+        /* tt = -(tail of tf) */
+        tt = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */
+        t0 = 4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */
+        t1 = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */
+        t2 = 6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */
+        t3 = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */
+        t4 = 1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */
+        t5 = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */
+        t6 = 6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */
+        t7 = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */
+        t8 = 2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */
+        t9 = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */
+        t10 = 8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */
+        t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */
+        t12 = 3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */
+        t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */
+        t14 = 3.35529192635519073543e-04, /* 0x3F35FD3E, 0xE8C2D3F4 */
+        u0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
+        u1 = 6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */
+        u2 = 1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */
+        u3 = 9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */
+        u4 = 2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */
+        u5 = 1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */
+        v1 = 2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */
+        v2 = 2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */
+        v3 = 7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */
+        v4 = 1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */
+        v5 = 3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */
+        s0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
+        s1 = 2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */
+        s2 = 3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */
+        s3 = 1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */
+        s4 = 2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */
+        s5 = 1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */
+        s6 = 3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */
+        r1 = 1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */
+        r2 = 7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */
+        r3 = 1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */
+        r4 = 1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */
+        r5 = 7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */
+        r6 = 7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */
+        w0 = 4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */
+        w1 = 8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */
+        w2 = -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */
+        w3 = 7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */
+        w4 = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */
+        w5 = 8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */
+        w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */
+
+    static const double zero = 0.00000000000000000000e+00;
+    double t, y, z, nadj, p, p1, p2, p3, q, r, w;
+    cl_int i, hx, lx, ix;
 
-double reference_lgamma( double x)
-{
-/*
- * ====================================================
- * This function is from fdlibm. http://www.netlib.org
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- *
- */
+    union {
+        double d;
+        cl_ulong u;
+    } u;
+    u.d = x;
 
-static const double //two52 = 4.50359962737049600000e+15, /* 0x43300000, 0x00000000 */
-                    half=  5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
-                    one =  1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
-                    pi  =  3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */
-                    a0  =  7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */
-                    a1  =  3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */
-                    a2  =  6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */
-                    a3  =  2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */
-                    a4  =  7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */
-                    a5  =  2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */
-                    a6  =  1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */
-                    a7  =  5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */
-                    a8  =  2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */
-                    a9  =  1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */
-                    a10 =  2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */
-                    a11 =  4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */
-                    tc  =  1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */
-                    tf  = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */
-                    /* tt = -(tail of tf) */
-                    tt  = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */
-                    t0  =  4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */
-                    t1  = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */
-                    t2  =  6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */
-                    t3  = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */
-                    t4  =  1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */
-                    t5  = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */
-                    t6  =  6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */
-                    t7  = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */
-                    t8  =  2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */
-                    t9  = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */
-                    t10 =  8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */
-                    t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */
-                    t12 =  3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */
-                    t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */
-                    t14 =  3.35529192635519073543e-04, /* 0x3F35FD3E, 0xE8C2D3F4 */
-                    u0  = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
-                    u1  =  6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */
-                    u2  =  1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */
-                    u3  =  9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */
-                    u4  =  2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */
-                    u5  =  1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */
-                    v1  =  2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */
-                    v2  =  2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */
-                    v3  =  7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */
-                    v4  =  1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */
-                    v5  =  3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */
-                    s0  = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
-                    s1  =  2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */
-                    s2  =  3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */
-                    s3  =  1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */
-                    s4  =  2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */
-                    s5  =  1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */
-                    s6  =  3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */
-                    r1  =  1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */
-                    r2  =  7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */
-                    r3  =  1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */
-                    r4  =  1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */
-                    r5  =  7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */
-                    r6  =  7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */
-                    w0  =  4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */
-                    w1  =  8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */
-                    w2  = -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */
-                    w3  =  7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */
-                    w4  = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */
-                    w5  =  8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */
-                    w6  = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */
-
-    static const double zero=  0.00000000000000000000e+00;
-    double t,y,z,nadj,p,p1,p2,p3,q,r,w;
-    cl_int i,hx,lx,ix;
-
-    union{ double d; cl_ulong u;}u; u.d = x;
-
-    hx = (cl_int) (u.u >> 32);
-    lx = (cl_int) (u.u & 0xffffffffULL);
+    hx = (cl_int)(u.u >> 32);
+    lx = (cl_int)(u.u & 0xffffffffULL);
 
     /* purge off +-inf, NaN, +-0, and negative arguments */
-//    *signgamp = 1;
-    ix = hx&0x7fffffff;
-    if(ix>=0x7ff00000) return x*x;
-    if((ix|lx)==0) return INFINITY;
-    if(ix<0x3b900000) {    /* |x|<2**-70, return -log(|x|) */
-        if(hx<0) {
-//            *signgamp = -1;
+    //    *signgamp = 1;
+    ix = hx & 0x7fffffff;
+    if (ix >= 0x7ff00000) return x * x;
+    if ((ix | lx) == 0) return INFINITY;
+    if (ix < 0x3b900000)
+    { /* |x|<2**-70, return -log(|x|) */
+        if (hx < 0)
+        {
+            //            *signgamp = -1;
             return -reference_log(-x);
-        } else return -reference_log(x);
+        }
+        else
+            return -reference_log(x);
     }
-    if(hx<0) {
-        if(ix>=0x43300000)     /* |x|>=2**52, must be -integer */
-        return INFINITY;
+    if (hx < 0)
+    {
+        if (ix >= 0x43300000) /* |x|>=2**52, must be -integer */
+            return INFINITY;
         t = reference_sinpi(x);
-        if(t==zero) return INFINITY; /* -integer */
-        nadj = reference_log(pi/reference_fabs(t*x));
-//        if(t<zero) *signgamp = -1;
+        if (t == zero) return INFINITY; /* -integer */
+        nadj = reference_log(pi / reference_fabs(t * x));
+        //        if(t<zero) *signgamp = -1;
         x = -x;
     }
 
     /* purge off 1 and 2 */
-    if((((ix-0x3ff00000)|lx)==0)||(((ix-0x40000000)|lx)==0)) r = 0;
+    if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0))
+        r = 0;
     /* for x < 2.0 */
-    else if(ix<0x40000000) {
-        if(ix<=0x3feccccc) {     /* lgamma(x) = lgamma(x+1)-log(x) */
-        r = -reference_log(x);
-        if(ix>=0x3FE76944) {y = 1.0-x; i= 0;}
-        else if(ix>=0x3FCDA661) {y= x-(tc-one); i=1;}
-          else {y = x; i=2;}
-        } else {
-          r = zero;
-            if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */
-            else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */
-        else {y=x-one;i=2;}
+    else if (ix < 0x40000000)
+    {
+        if (ix <= 0x3feccccc)
+        { /* lgamma(x) = lgamma(x+1)-log(x) */
+            r = -reference_log(x);
+            if (ix >= 0x3FE76944)
+            {
+                y = 1.0 - x;
+                i = 0;
+            }
+            else if (ix >= 0x3FCDA661)
+            {
+                y = x - (tc - one);
+                i = 1;
+            }
+            else
+            {
+                y = x;
+                i = 2;
+            }
+        }
+        else
+        {
+            r = zero;
+            if (ix >= 0x3FFBB4C3)
+            {
+                y = 2.0 - x;
+                i = 0;
+            } /* [1.7316,2] */
+            else if (ix >= 0x3FF3B4C4)
+            {
+                y = x - tc;
+                i = 1;
+            } /* [1.23,1.73] */
+            else
+            {
+                y = x - one;
+                i = 2;
+            }
         }
-        switch(i) {
-          case 0:
-        z = y*y;
-        p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10))));
-        p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11)))));
-        p  = y*p1+p2;
-        r  += (p-0.5*y); break;
-          case 1:
-        z = y*y;
-        w = z*y;
-        p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12)));    /* parallel comp */
-        p2 = t1+w*(t4+w*(t7+w*(t10+w*t13)));
-        p3 = t2+w*(t5+w*(t8+w*(t11+w*t14)));
-        p  = z*p1-(tt-w*(p2+y*p3));
-        r += (tf + p); break;
-          case 2:
-        p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5)))));
-        p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5))));
-        r += (-0.5*y + p1/p2);
+        switch (i)
+        {
+            case 0:
+                z = y * y;
+                p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
+                p2 = z
+                    * (a1
+                       + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
+                p = y * p1 + p2;
+                r += (p - 0.5 * y);
+                break;
+            case 1:
+                z = y * y;
+                w = z * y;
+                p1 = t0
+                    + w
+                        * (t3
+                           + w * (t6 + w * (t9 + w * t12))); /* parallel comp */
+                p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
+                p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
+                p = z * p1 - (tt - w * (p2 + y * p3));
+                r += (tf + p);
+                break;
+            case 2:
+                p1 = y
+                    * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+                p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+                r += (-0.5 * y + p1 / p2);
         }
     }
-    else if(ix<0x40200000) {             /* x < 8.0 */
+    else if (ix < 0x40200000)
+    { /* x < 8.0 */
         i = (int)x;
         t = zero;
-        y = x-(double)i;
-        p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6))))));
-        q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6)))));
-        r = half*y+p/q;
-        z = one;    /* lgamma(1+s) = log(s) + lgamma(s) */
-        switch(i) {
-        case 7: z *= (y+6.0);    /* FALLTHRU */
-        case 6: z *= (y+5.0);    /* FALLTHRU */
-        case 5: z *= (y+4.0);    /* FALLTHRU */
-        case 4: z *= (y+3.0);    /* FALLTHRU */
-        case 3: z *= (y+2.0);    /* FALLTHRU */
-            r += reference_log(z); break;
+        y = x - (double)i;
+        p = y
+            * (s0
+               + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));
+        q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
+        r = half * y + p / q;
+        z = one; /* lgamma(1+s) = log(s) + lgamma(s) */
+        switch (i)
+        {
+            case 7: z *= (y + 6.0); /* FALLTHRU */
+            case 6: z *= (y + 5.0); /* FALLTHRU */
+            case 5: z *= (y + 4.0); /* FALLTHRU */
+            case 4: z *= (y + 3.0); /* FALLTHRU */
+            case 3:
+                z *= (y + 2.0); /* FALLTHRU */
+                r += reference_log(z);
+                break;
         }
-    /* 8.0 <= x < 2**58 */
-    } else if (ix < 0x43900000) {
+        /* 8.0 <= x < 2**58 */
+    }
+    else if (ix < 0x43900000)
+    {
         t = reference_log(x);
-        z = one/x;
-        y = z*z;
-        w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6)))));
-        r = (x-half)*(t-one)+w;
-    } else
-    /* 2**58 <= x <= inf */
-        r =  x*(reference_log(x)-one);
-    if(hx<0) r = nadj - r;
+        z = one / x;
+        y = z * z;
+        w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
+        r = (x - half) * (t - one) + w;
+    }
+    else
+        /* 2**58 <= x <= inf */
+        r = x * (reference_log(x) - one);
+    if (hx < 0) r = nadj - r;
     return r;
-
 }
 
 #endif // _MSC_VER
 
-double reference_assignment( double x ){ return x; }
+double reference_assignment(double x) { return x; }
 
-int reference_not( double x )
+int reference_not(double x)
 {
-  int r = !x;
-  return r;
+    int r = !x;
+    return r;
 }
 
 #pragma mark -
 #pragma mark Double testing
 
 #ifndef M_PIL
-    #define M_PIL        3.14159265358979323846264338327950288419716939937510582097494459230781640628620899L
+#define M_PIL                                                                  \
+    3.14159265358979323846264338327950288419716939937510582097494459230781640628620899L
 #endif
 
-static long double reduce1l( long double x );
+static long double reduce1l(long double x);
 
 #ifdef __PPC__
 // Since long double on PPC is really extended precision double arithmetic
@@ -1873,36 +2202,35 @@ static long double reduce1l( long double x );
 // such that reduction algorithm used for other architectures will not work.
 // Instead and alternate reduction method is used.
 
-static long double reduce1l( long double x )
+static long double reduce1l(long double x)
 {
-  union {
-    long double ld;
-    double d[2];
-  } u;
+    union {
+        long double ld;
+        double d[2];
+    } u;
 
-  // Reduce the high and low halfs separately.
-  u.ld = x;
-  return ((long double)reduce1(u.d[0]) + reduce1(u.d[1]));
+    // Reduce the high and low halves separately.
+    u.ld = x;
+    return ((long double)reduce1(u.d[0]) + reduce1(u.d[1]));
 }
 
 #else // !__PPC__
 
-static long double reduce1l( long double x )
+static long double reduce1l(long double x)
 {
     static long double unit_exp = 0;
-    if( 0.0L == unit_exp )
-        unit_exp = scalbnl( 1.0L, LDBL_MANT_DIG);
+    if (0.0L == unit_exp) unit_exp = scalbnl(1.0L, LDBL_MANT_DIG);
 
-    if( reference_fabsl(x) >= unit_exp )
+    if (reference_fabsl(x) >= unit_exp)
     {
-        if( reference_fabsl(x) == INFINITY )
-            return cl_make_nan();
+        if (reference_fabsl(x) == INFINITY) return cl_make_nan();
 
-        return 0.0L; //we patch up the sign for sinPi and cosPi later, since they need different signs
+        return 0.0L; // we patch up the sign for sinPi and cosPi later, since
+                     // they need different signs
     }
 
     // Find the nearest multiple of 2
-    const long double r = reference_copysignl( unit_exp, x );
+    const long double r = reference_copysignl(unit_exp, x);
     long double z = x + r;
     z -= r;
 
@@ -1911,19 +2239,31 @@ static long double reduce1l( long double x )
 }
 #endif // __PPC__
 
-long double reference_acospil( long double x){  return reference_acosl( x ) / M_PIL;    }
-long double reference_asinpil( long double x){  return reference_asinl( x ) / M_PIL;    }
-long double reference_atanpil( long double x){  return reference_atanl( x ) / M_PIL;    }
-long double reference_atan2pil( long double y, long double x){ return reference_atan2l( y, x) / M_PIL; }
-long double reference_cospil( long double x)
+long double reference_acospil(long double x)
+{
+    return reference_acosl(x) / M_PIL;
+}
+long double reference_asinpil(long double x)
+{
+    return reference_asinl(x) / M_PIL;
+}
+long double reference_atanpil(long double x)
+{
+    return reference_atanl(x) / M_PIL;
+}
+long double reference_atan2pil(long double y, long double x)
+{
+    return reference_atan2l(y, x) / M_PIL;
+}
+long double reference_cospil(long double x)
 {
-    if( reference_fabsl(x) >= HEX_LDBL( +, 1, 0, +, 54 ) )
+    if (reference_fabsl(x) >= HEX_LDBL(+, 1, 0, +, 54))
     {
-        if( reference_fabsl(x) == INFINITY )
-            return cl_make_nan();
+        if (reference_fabsl(x) == INFINITY) return cl_make_nan();
 
-        //Note this probably fails for odd values between 0x1.0p52 and 0x1.0p53.
-        //However, when starting with single precision inputs, there will be no odd values.
+        // Note this probably fails for odd values between 0x1.0p52 and
+        // 0x1.0p53. However, when starting with single precision inputs, there
+        // will be no odd values.
 
         return 1.0L;
     }
@@ -1935,9 +2275,9 @@ long double reference_cospil( long double x)
     // phase adjust
     double xhi = 0.0;
     double xlo = 0.0;
-    xhi = (double) x + 0.5;
+    xhi = (double)x + 0.5;
 
-    if(reference_fabsl(x) > 0.5L)
+    if (reference_fabsl(x) > 0.5L)
     {
         xlo = xhi - x;
         xlo = 0.5 - xlo;
@@ -1949,61 +2289,69 @@ long double reference_cospil( long double x)
     }
 
     // reduce to [-0.5, 0.5]
-    if( xhi < -0.5 )
+    if (xhi < -0.5)
     {
         xhi = -1.0 - xhi;
         xlo = -xlo;
     }
-    else if ( xhi > 0.5 )
+    else if (xhi > 0.5)
     {
         xhi = 1.0 - xhi;
         xlo = -xlo;
     }
 
     // cosPi zeros are all +0
-    if( xhi == 0.0 && xlo == 0.0 )
-        return 0.0;
+    if (xhi == 0.0 && xlo == 0.0) return 0.0;
 
     xhi *= M_PI;
     xlo *= M_PI;
 
     xhi += xlo;
 
-    return reference_sinl( xhi );
+    return reference_sinl(xhi);
 
 #else
     // phase adjust
     x += 0.5L;
 
     // reduce to [-0.5, 0.5]
-    if( x < -0.5L )
+    if (x < -0.5L)
         x = -1.0L - x;
-    else if ( x > 0.5L )
+    else if (x > 0.5L)
         x = 1.0L - x;
 
     // cosPi zeros are all +0
-    if( x == 0.0L )
-        return 0.0L;
+    if (x == 0.0L) return 0.0L;
 
-    return reference_sinl( x * M_PIL );
+    return reference_sinl(x * M_PIL);
 #endif
 }
 
-long double reference_dividel( long double x, long double y)
+long double reference_dividel(long double x, long double y)
 {
     double dx = x;
     double dy = y;
-    return dx/dy;
+    return dx / dy;
 }
 
-typedef struct{ double hi, lo; } double_double;
+typedef struct
+{
+    double hi, lo;
+} double_double;
 
-// Split doubles_double into a series of consecutive 26-bit precise doubles and a remainder.
-// Note for later -- for multiplication, it might be better to split each double into a power of two and two 26 bit portions
-//                      multiplication of a double double by a known power of two is cheap. The current approach causes some inexact arithmetic in mul_dd.
-static inline void split_dd( double_double x, double_double *hi, double_double *lo )
+// Split a double_double into a series of consecutive 26-bit precise doubles
+// and a remainder.
+// Note for later -- for multiplication, it might be better to split each
+// double into a power of two and two 26-bit portions: multiplication of a
+// double_double by a known power of two is cheap. The current approach causes
+// some inexact arithmetic in mul_dd.
+static inline void split_dd(double_double x, double_double *hi,
+                            double_double *lo)
 {
-    union{ double d; cl_ulong u;}u;
+    union {
+        double d;
+        cl_ulong u;
+    } u;
     u.d = x.hi;
     u.u &= 0xFFFFFFFFF8000000ULL;
     hi->hi = u.d;
@@ -2025,10 +2373,10 @@ static inline void split_dd( double_double x, double_double *hi, double_double *
     lo->lo = x.hi + x.lo;
 }
 
-static inline double_double accum_d( double_double a, double b )
+static inline double_double accum_d(double_double a, double b)
 {
     double temp;
-    if( fabs(b) > fabs(a.hi) )
+    if (fabs(b) > fabs(a.hi))
     {
         temp = a.hi;
         a.hi += b;
@@ -2041,47 +2389,45 @@ static inline double_double accum_d( double_double a, double b )
         a.lo += b - (a.hi - temp);
     }
 
-    if( isnan( a.lo ) )
-        a.lo = 0.0;
+    if (isnan(a.lo)) a.lo = 0.0;
 
     return a;
 }
 
-static inline double_double add_dd( double_double a, double_double b )
+static inline double_double add_dd(double_double a, double_double b)
 {
-    double_double r = {-0.0 -0.0 };
+    double_double r = { -0.0 - 0.0 };
 
-    if( isinf(a.hi) || isinf( b.hi )  ||
-       isnan(a.hi) || isnan( b.hi )  ||
-       0.0 == a.hi || 0.0 == b.hi )
+    if (isinf(a.hi) || isinf(b.hi) || isnan(a.hi) || isnan(b.hi) || 0.0 == a.hi
+        || 0.0 == b.hi)
     {
         r.hi = a.hi + b.hi;
         r.lo = a.lo + b.lo;
-        if( isnan( r.lo ) )
-            r.lo = 0.0;
+        if (isnan(r.lo)) r.lo = 0.0;
         return r;
     }
 
-    //merge sort terms by magnitude -- here we assume that |a.hi| > |a.lo|, |b.hi| > |b.lo|, so we don't have to do the first merge pass
+    // merge sort terms by magnitude -- here we assume that |a.hi| > |a.lo|,
+    // |b.hi| > |b.lo|, so we don't have to do the first merge pass
     double terms[4] = { a.hi, b.hi, a.lo, b.lo };
     double temp;
 
-    //Sort hi terms
-    if( fabs(terms[0]) < fabs(terms[1]) )
+    // Sort hi terms
+    if (fabs(terms[0]) < fabs(terms[1]))
     {
         temp = terms[0];
         terms[0] = terms[1];
         terms[1] = temp;
     }
-    //sort lo terms
-    if( fabs(terms[2]) < fabs(terms[3]) )
+    // sort lo terms
+    if (fabs(terms[2]) < fabs(terms[3]))
     {
         temp = terms[2];
         terms[2] = terms[3];
         terms[3] = temp;
     }
     // Fix case where small high term is less than large low term
-    if( fabs(terms[1]) < fabs(terms[2]) )
+    if (fabs(terms[1]) < fabs(terms[2]))
     {
         temp = terms[1];
         terms[1] = terms[2];
@@ -2104,42 +2450,40 @@ static inline double_double add_dd( double_double a, double_double b )
     temp = r.hi;
     r.hi += r.lo;
     r.lo = r.lo - (r.hi - temp);
-    if( isnan( r.lo ) )
-        r.lo = 0.0;
+    if (isnan(r.lo)) r.lo = 0.0;
 
     return r;
 }
 
-static inline double_double mul_dd( double_double a, double_double b )
+static inline double_double mul_dd(double_double a, double_double b)
 {
-    double_double result = {-0.0,-0.0};
+    double_double result = { -0.0, -0.0 };
 
     // Inf, nan and 0
-    if( isnan( a.hi ) || isnan( b.hi ) ||
-       isinf( a.hi ) || isinf( b.hi ) ||
-       0.0 == a.hi || 0.0 == b.hi )
+    if (isnan(a.hi) || isnan(b.hi) || isinf(a.hi) || isinf(b.hi) || 0.0 == a.hi
+        || 0.0 == b.hi)
     {
         result.hi = a.hi * b.hi;
         return result;
     }
 
     double_double ah, al, bh, bl;
-    split_dd( a, &ah, &al );
-    split_dd( b, &bh, &bl );
-
-    double p0 = ah.hi * bh.hi;        // exact    (52 bits in product) 0
-    double p1 = ah.hi * bh.lo;        // exact    (52 bits in product) 26
-    double p2 = ah.lo * bh.hi;        // exact    (52 bits in product) 26
-    double p3 = ah.lo * bh.lo;        // exact    (52 bits in product) 52
-    double p4 = al.hi * bh.hi;        // exact    (52 bits in product) 52
-    double p5 = al.hi * bh.lo;        // exact    (52 bits in product) 78
-    double p6 = al.lo * bh.hi;        // inexact  (54 bits in product) 78
-    double p7 = al.lo * bh.lo;        // inexact  (54 bits in product) 104
-    double p8 = ah.hi * bl.hi;        // exact    (52 bits in product) 52
-    double p9 = ah.hi * bl.lo;        // inexact  (54 bits in product) 78
-    double pA = ah.lo * bl.hi;        // exact    (52 bits in product) 78
-    double pB = ah.lo * bl.lo;        // inexact  (54 bits in product) 104
-    double pC = al.hi * bl.hi;        // exact    (52 bits in product) 104
+    split_dd(a, &ah, &al);
+    split_dd(b, &bh, &bl);
+
+    double p0 = ah.hi * bh.hi; // exact    (52 bits in product) 0
+    double p1 = ah.hi * bh.lo; // exact    (52 bits in product) 26
+    double p2 = ah.lo * bh.hi; // exact    (52 bits in product) 26
+    double p3 = ah.lo * bh.lo; // exact    (52 bits in product) 52
+    double p4 = al.hi * bh.hi; // exact    (52 bits in product) 52
+    double p5 = al.hi * bh.lo; // exact    (52 bits in product) 78
+    double p6 = al.lo * bh.hi; // inexact  (54 bits in product) 78
+    double p7 = al.lo * bh.lo; // inexact  (54 bits in product) 104
+    double p8 = ah.hi * bl.hi; // exact    (52 bits in product) 52
+    double p9 = ah.hi * bl.lo; // inexact  (54 bits in product) 78
+    double pA = ah.lo * bl.hi; // exact    (52 bits in product) 78
+    double pB = ah.lo * bl.lo; // inexact  (54 bits in product) 104
+    double pC = al.hi * bl.hi; // exact    (52 bits in product) 104
     // the last 3 terms are two low to appear in the result
 
 
@@ -2169,46 +2513,60 @@ static inline double_double mul_dd( double_double a, double_double b )
 
     return result;
 #else
-    // take advantage of the known relative magnitudes of the partial products to avoid some sorting
-    // Combine 2**-78 and 2**-104 terms. Here we are a bit sloppy about canonicalizing the double_doubles
+    // Take advantage of the known relative magnitudes of the partial products
+    // to avoid some sorting. Combine 2**-78 and 2**-104 terms. Here we are a
+    // bit sloppy about canonicalizing the double_doubles.
     double_double t0 = { pA, pC };
     double_double t1 = { p9, pB };
     double_double t2 = { p6, p7 };
     double temp0, temp1, temp2;
 
-    t0 = accum_d( t0, p5 );  // there is an extra 2**-78 term to deal with
-
-    // Add in 2**-52 terms. Here we are a bit sloppy about canonicalizing the double_doubles
-    temp0 = t0.hi;      temp1 = t1.hi;      temp2 = t2.hi;
-    t0.hi += p3;        t1.hi += p4;        t2.hi += p8;
-    temp0 -= t0.hi-p3;  temp1 -= t1.hi-p4;  temp2 -= t2.hi - p8;
-    t0.lo += temp0;     t1.lo += temp1;     t2.lo += temp2;
-
-    // Add in 2**-26 terms. Here we are a bit sloppy about canonicalizing the double_doubles
-    temp1 = t1.hi;      temp2 = t2.hi;
-    t1.hi += p1;        t2.hi += p2;
-    temp1 -= t1.hi-p1;  temp2 -= t2.hi - p2;
-    t1.lo += temp1;     t2.lo += temp2;
+    t0 = accum_d(t0, p5); // there is an extra 2**-78 term to deal with
+
+    // Add in 2**-52 terms. Here we are a bit sloppy about canonicalizing the
+    // double_doubles
+    temp0 = t0.hi;
+    temp1 = t1.hi;
+    temp2 = t2.hi;
+    t0.hi += p3;
+    t1.hi += p4;
+    t2.hi += p8;
+    temp0 -= t0.hi - p3;
+    temp1 -= t1.hi - p4;
+    temp2 -= t2.hi - p8;
+    t0.lo += temp0;
+    t1.lo += temp1;
+    t2.lo += temp2;
+
+    // Add in 2**-26 terms. Here we are a bit sloppy about canonicalizing the
+    // double_doubles
+    temp1 = t1.hi;
+    temp2 = t2.hi;
+    t1.hi += p1;
+    t2.hi += p2;
+    temp1 -= t1.hi - p1;
+    temp2 -= t2.hi - p2;
+    t1.lo += temp1;
+    t2.lo += temp2;
 
     // Combine accumulators to get the low bits of result
-    t1 = add_dd( t1, add_dd( t2, t0 ) );
+    t1 = add_dd(t1, add_dd(t2, t0));
 
     // Add in MSB's, and round to precision
-    return accum_d( t1, p0 );  // canonicalizes
+    return accum_d(t1, p0); // canonicalizes
 #endif
-
 }
 
 
-long double reference_exp10l( long double z )
+long double reference_exp10l(long double z)
 {
-    const double_double log2_10 = { HEX_DBL( +, 1, a934f0979a371, +, 1 ), HEX_DBL( +, 1, 7f2495fb7fa6d, -, 53 ) };
+    const double_double log2_10 = { HEX_DBL(+, 1, a934f0979a371, +, 1),
+                                    HEX_DBL(+, 1, 7f2495fb7fa6d, -, 53) };
     double_double x;
     int j;
 
     // Handle NaNs
-    if( isnan(z) )
-        return z;
+    if (isnan(z)) return z;
 
     // init x
     x.hi = z;
@@ -2217,172 +2575,195 @@ long double reference_exp10l( long double z )
 
     // 10**x = exp2( x * log2(10) )
 
-    x = mul_dd( x, log2_10);    // x * log2(10)
+    x = mul_dd(x, log2_10); // x * log2(10)
 
-    //Deal with overflow and underflow for exp2(x) stage next
-    if( x.hi >= 1025 )
-        return INFINITY;
+    // Deal with overflow and underflow for exp2(x) stage next
+    if (x.hi >= 1025) return INFINITY;
 
-    if( x.hi < -1075-24 )
-        return +0.0;
+    if (x.hi < -1075 - 24) return +0.0;
 
     // find nearest integer to x
-    int i = (int) rint(x.hi);
+    int i = (int)rint(x.hi);
 
     // x now holds fractional part.  The result would be then 2**i  * exp2( x )
     x.hi -= i;
 
-    // We could attempt to find a minimax polynomial for exp2(x) over the range x = [-0.5, 0.5].
-    // However, this would converge very slowly near the extrema, where 0.5**n is not a lot different
-    // from 0.5**(n+1), thereby requiring something like a 20th order polynomial to get 53 + 24 bits
-    // of precision. Instead we further reduce the range to [-1/32, 1/32] by observing that
+    // We could attempt to find a minimax polynomial for exp2(x) over the range
+    // x = [-0.5, 0.5]. However, this would converge very slowly near the
+    // extrema, where 0.5**n is not a lot different from 0.5**(n+1), thereby
+    // requiring something like a 20th order polynomial to get 53 + 24 bits of
+    // precision. Instead we further reduce the range to [-1/32, 1/32] by
+    // observing that
     //
     //  2**(a+b) = 2**a * 2**b
     //
-    // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and reduce the range
-    // of x to [-1/32, 1/32] by subtracting away the nearest value of n/16 from x.
-    const double_double corrections[17] =
-    {
-        { HEX_DBL( +, 1, 6a09e667f3bcd, -, 1 ), HEX_DBL( -, 1, bdd3413b26456, -, 55 ) },
-        { HEX_DBL( +, 1, 7a11473eb0187, -, 1 ), HEX_DBL( -, 1, 41577ee04992f, -, 56 ) },
-        { HEX_DBL( +, 1, 8ace5422aa0db, -, 1 ), HEX_DBL( +, 1, 6e9f156864b27, -, 55 ) },
-        { HEX_DBL( +, 1, 9c49182a3f09,  -, 1 ), HEX_DBL( +, 1, c7c46b071f2be, -, 57 ) },
-        { HEX_DBL( +, 1, ae89f995ad3ad, -, 1 ), HEX_DBL( +, 1, 7a1cd345dcc81, -, 55 ) },
-        { HEX_DBL( +, 1, c199bdd85529c, -, 1 ), HEX_DBL( +, 1, 11065895048dd, -, 56 ) },
-        { HEX_DBL( +, 1, d5818dcfba487, -, 1 ), HEX_DBL( +, 1, 2ed02d75b3707, -, 56 ) },
-        { HEX_DBL( +, 1, ea4afa2a490da, -, 1 ), HEX_DBL( -, 1, e9c23179c2893, -, 55 ) },
-        { HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,             +,  0 ) },
-        { HEX_DBL( +, 1, 0b5586cf9890f, +, 0 ), HEX_DBL( +, 1, 8a62e4adc610b, -, 54 ) },
-        { HEX_DBL( +, 1, 172b83c7d517b, +, 0 ), HEX_DBL( -, 1, 19041b9d78a76, -, 55 ) },
-        { HEX_DBL( +, 1, 2387a6e756238, +, 0 ), HEX_DBL( +, 1, 9b07eb6c70573, -, 54 ) },
-        { HEX_DBL( +, 1, 306fe0a31b715, +, 0 ), HEX_DBL( +, 1, 6f46ad23182e4, -, 55 ) },
-        { HEX_DBL( +, 1, 3dea64c123422, +, 0 ), HEX_DBL( +, 1, ada0911f09ebc, -, 55 ) },
-        { HEX_DBL( +, 1, 4bfdad5362a27, +, 0 ), HEX_DBL( +, 1, d4397afec42e2, -, 56 ) },
-        { HEX_DBL( +, 1, 5ab07dd485429, +, 0 ), HEX_DBL( +, 1, 6324c054647ad, -, 54 ) },
-        { HEX_DBL( +, 1, 6a09e667f3bcd, +, 0 ), HEX_DBL( -, 1, bdd3413b26456, -, 54 ) }
+    // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and
+    // reduce the range of x to [-1/32, 1/32] by subtracting away the nearest
+    // value of n/16 from x.
+    const double_double corrections[17] = {
+        { HEX_DBL(+, 1, 6a09e667f3bcd, -, 1),
+          HEX_DBL(-, 1, bdd3413b26456, -, 55) },
+        { HEX_DBL(+, 1, 7a11473eb0187, -, 1),
+          HEX_DBL(-, 1, 41577ee04992f, -, 56) },
+        { HEX_DBL(+, 1, 8ace5422aa0db, -, 1),
+          HEX_DBL(+, 1, 6e9f156864b27, -, 55) },
+        { HEX_DBL(+, 1, 9c49182a3f09, -, 1),
+          HEX_DBL(+, 1, c7c46b071f2be, -, 57) },
+        { HEX_DBL(+, 1, ae89f995ad3ad, -, 1),
+          HEX_DBL(+, 1, 7a1cd345dcc81, -, 55) },
+        { HEX_DBL(+, 1, c199bdd85529c, -, 1),
+          HEX_DBL(+, 1, 11065895048dd, -, 56) },
+        { HEX_DBL(+, 1, d5818dcfba487, -, 1),
+          HEX_DBL(+, 1, 2ed02d75b3707, -, 56) },
+        { HEX_DBL(+, 1, ea4afa2a490da, -, 1),
+          HEX_DBL(-, 1, e9c23179c2893, -, 55) },
+        { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+        { HEX_DBL(+, 1, 0b5586cf9890f, +, 0),
+          HEX_DBL(+, 1, 8a62e4adc610b, -, 54) },
+        { HEX_DBL(+, 1, 172b83c7d517b, +, 0),
+          HEX_DBL(-, 1, 19041b9d78a76, -, 55) },
+        { HEX_DBL(+, 1, 2387a6e756238, +, 0),
+          HEX_DBL(+, 1, 9b07eb6c70573, -, 54) },
+        { HEX_DBL(+, 1, 306fe0a31b715, +, 0),
+          HEX_DBL(+, 1, 6f46ad23182e4, -, 55) },
+        { HEX_DBL(+, 1, 3dea64c123422, +, 0),
+          HEX_DBL(+, 1, ada0911f09ebc, -, 55) },
+        { HEX_DBL(+, 1, 4bfdad5362a27, +, 0),
+          HEX_DBL(+, 1, d4397afec42e2, -, 56) },
+        { HEX_DBL(+, 1, 5ab07dd485429, +, 0),
+          HEX_DBL(+, 1, 6324c054647ad, -, 54) },
+        { HEX_DBL(+, 1, 6a09e667f3bcd, +, 0),
+          HEX_DBL(-, 1, bdd3413b26456, -, 54) }
 
     };
-    int index = (int) rint( x.hi * 16.0 );
-    x.hi -= (double) index * 0.0625;
+    int index = (int)rint(x.hi * 16.0);
+    x.hi -= (double)index * 0.0625;
 
     // canonicalize x
     double temp = x.hi;
     x.hi += x.lo;
     x.lo -= x.hi - temp;
 
-    // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32].  Max Error: 2 * 0x1.e112p-87
-    const double_double c[] = {
-        {HEX_DBL( +, 1, 62e42fefa39ef, -,  1 ), HEX_DBL( +, 1, abc9e3ac1d244, -, 56 )},
-        {HEX_DBL( +, 1, ebfbdff82c58f, -,  3 ), HEX_DBL( -, 1, 5e4987a631846, -, 57 )},
-        {HEX_DBL( +, 1, c6b08d704a0c,  -,  5 ), HEX_DBL( -, 1, d323200a05713, -, 59 )},
-        {HEX_DBL( +, 1, 3b2ab6fba4e7a, -,  7 ), HEX_DBL( +, 1, c5ee8f8b9f0c1, -, 63 )},
-        {HEX_DBL( +, 1, 5d87fe78a672a, -, 10 ), HEX_DBL( +, 1, 884e5e5cc7ecc, -, 64 )},
-        {HEX_DBL( +, 1, 430912f7e8373, -, 13 ), HEX_DBL( +, 1, 4f1b59514a326, -, 67 )},
-        {HEX_DBL( +, 1, ffcbfc5985e71, -, 17 ), HEX_DBL( -, 1, db7d6a0953b78, -, 71 )},
-        {HEX_DBL( +, 1, 62c150eb16465, -, 20 ), HEX_DBL( +, 1, e0767c2d7abf5, -, 80 )},
-        {HEX_DBL( +, 1, b52502b5e953,  -, 24 ), HEX_DBL( +, 1, 6797523f944bc, -, 78 )}
-    };
-    size_t count = sizeof( c ) / sizeof( c[0] );
+    // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32].  Max
+    // Error: 2 * 0x1.e112p-87
+    const double_double c[] = { { HEX_DBL(+, 1, 62e42fefa39ef, -, 1),
+                                  HEX_DBL(+, 1, abc9e3ac1d244, -, 56) },
+                                { HEX_DBL(+, 1, ebfbdff82c58f, -, 3),
+                                  HEX_DBL(-, 1, 5e4987a631846, -, 57) },
+                                { HEX_DBL(+, 1, c6b08d704a0c, -, 5),
+                                  HEX_DBL(-, 1, d323200a05713, -, 59) },
+                                { HEX_DBL(+, 1, 3b2ab6fba4e7a, -, 7),
+                                  HEX_DBL(+, 1, c5ee8f8b9f0c1, -, 63) },
+                                { HEX_DBL(+, 1, 5d87fe78a672a, -, 10),
+                                  HEX_DBL(+, 1, 884e5e5cc7ecc, -, 64) },
+                                { HEX_DBL(+, 1, 430912f7e8373, -, 13),
+                                  HEX_DBL(+, 1, 4f1b59514a326, -, 67) },
+                                { HEX_DBL(+, 1, ffcbfc5985e71, -, 17),
+                                  HEX_DBL(-, 1, db7d6a0953b78, -, 71) },
+                                { HEX_DBL(+, 1, 62c150eb16465, -, 20),
+                                  HEX_DBL(+, 1, e0767c2d7abf5, -, 80) },
+                                { HEX_DBL(+, 1, b52502b5e953, -, 24),
+                                  HEX_DBL(+, 1, 6797523f944bc, -, 78) } };
+    size_t count = sizeof(c) / sizeof(c[0]);
 
     // Do polynomial
-    double_double r = c[count-1];
-    for( j = (int) count-2; j >= 0; j-- )
-        r = add_dd( c[j], mul_dd( r, x ) );
+    double_double r = c[count - 1];
+    for (j = (int)count - 2; j >= 0; j--) r = add_dd(c[j], mul_dd(r, x));
 
     // unwind approximation
-    r = mul_dd( r, x );     // before: r =(exp2(x)-1)/x;   after: r = exp2(x) - 1
+    r = mul_dd(r, x); // before: r =(exp2(x)-1)/x;   after: r = exp2(x) - 1
 
     // correct for [-0.5, 0.5] -> [-1/32, 1/32] reduction above
     //  exp2(x) = (r + 1) * correction = r * correction + correction
-    r = mul_dd( r, corrections[index+8] );
-    r = add_dd( r, corrections[index+8] );
+    r = mul_dd(r, corrections[index + 8]);
+    r = add_dd(r, corrections[index + 8]);
 
-// Format result for output:
+    // Format result for output:
 
     // Get mantissa
-    long double m = ((long double) r.hi + (long double) r.lo );
+    long double m = ((long double)r.hi + (long double)r.lo);
 
     // Handle a pesky overflow cases when long double = double
-    if( i > 512 )
+    if (i > 512)
     {
-        m *=  HEX_DBL( +, 1, 0, +, 512 );
+        m *= HEX_DBL(+, 1, 0, +, 512);
         i -= 512;
     }
-    else if( i < -512 )
+    else if (i < -512)
     {
-        m *= HEX_DBL( +, 1, 0, -, 512 );
+        m *= HEX_DBL(+, 1, 0, -, 512);
         i += 512;
     }
 
-    return m * ldexpl( 1.0L, i );
+    return m * ldexpl(1.0L, i);
 }
 
 
-static double fallback_frexp( double x, int *iptr )
+static double fallback_frexp(double x, int *iptr)
 {
     cl_ulong u, v;
     double fu, fv;
 
-    memcpy( &u, &x, sizeof(u));
+    memcpy(&u, &x, sizeof(u));
 
-    cl_ulong exponent = u &  0x7ff0000000000000ULL;
+    cl_ulong exponent = u & 0x7ff0000000000000ULL;
     cl_ulong mantissa = u & ~0x7ff0000000000000ULL;
 
     // add 1 to the exponent
     exponent += 0x0010000000000000ULL;
 
-    if( (cl_long) exponent < (cl_long) 0x0020000000000000LL )
+    if ((cl_long)exponent < (cl_long)0x0020000000000000LL)
     { // subnormal, NaN, Inf
         mantissa |= 0x3fe0000000000000ULL;
 
         v = mantissa & 0xfff0000000000000ULL;
         u = mantissa;
-        memcpy( &fv, &v, sizeof(v));
-        memcpy( &fu, &u, sizeof(u));
+        memcpy(&fv, &v, sizeof(v));
+        memcpy(&fu, &u, sizeof(u));
 
         fu -= fv;
 
-        memcpy( &v, &fv, sizeof(v));
-        memcpy( &u, &fu, sizeof(u));
+        memcpy(&v, &fv, sizeof(v));
+        memcpy(&u, &fu, sizeof(u));
 
-        exponent = u &  0x7ff0000000000000ULL;
+        exponent = u & 0x7ff0000000000000ULL;
         mantissa = u & ~0x7ff0000000000000ULL;
 
-        *iptr = (exponent >> 52) + (-1022 + 1 -1022);
+        *iptr = (exponent >> 52) + (-1022 + 1 - 1022);
         u = mantissa | 0x3fe0000000000000ULL;
-        memcpy( &fu, &u, sizeof(u));
+        memcpy(&fu, &u, sizeof(u));
         return fu;
     }
 
     *iptr = (exponent >> 52) - 1023;
     u = mantissa | 0x3fe0000000000000ULL;
-    memcpy( &fu, &u, sizeof(u));
+    memcpy(&fu, &u, sizeof(u));
     return fu;
 }
 
 // Assumes zeros, infinities and NaNs handed elsewhere
-static inline int extract( double x, cl_ulong *mant );
-static inline int extract( double x, cl_ulong *mant )
+static inline int extract(double x, cl_ulong *mant);
+static inline int extract(double x, cl_ulong *mant)
 {
-    static double (*frexpp)(double, int*) = NULL;
+    static double (*frexpp)(double, int *) = NULL;
     int e;
 
     // verify that frexp works properly
-    if( NULL == frexpp )
+    if (NULL == frexpp)
     {
-        if( 0.5 == frexp( HEX_DBL( +, 1, 0, -, 1030 ), &e ) && e == -1029 )
+        if (0.5 == frexp(HEX_DBL(+, 1, 0, -, 1030), &e) && e == -1029)
             frexpp = frexp;
         else
             frexpp = fallback_frexp;
     }
 
-    *mant = (cl_ulong) (HEX_DBL( +, 1, 0, +, 64 ) * fabs( frexpp( x, &e )));
+    *mant = (cl_ulong)(HEX_DBL(+, 1, 0, +, 64) * fabs(frexpp(x, &e)));
     return e - 1;
 }
 
 // Return 128-bit product of a*b  as (hi << 64) + lo
-static inline void mul128( cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo );
-static inline void mul128( cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo )
+static inline void mul128(cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo);
+static inline void mul128(cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo)
 {
     cl_ulong alo = a & 0xffffffffULL;
     cl_ulong ahi = a >> 32;
@@ -2393,16 +2774,22 @@ static inline void mul128( cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo )
     cl_ulong ahiblo = ahi * blo;
     cl_ulong ahibhi = ahi * bhi;
 
-    alobhi += (aloblo >> 32) + (ahiblo & 0xffffffffULL);  // cannot overflow: (2^32-1)^2 + 2 * (2^32-1)   = (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
-    *hi = ahibhi + (alobhi >> 32) + (ahiblo >> 32);       // cannot overflow: (2^32-1)^2 + 2 * (2^32-1)   = (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
+    alobhi += (aloblo >> 32)
+        + (ahiblo
+           & 0xffffffffULL); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1)   =
+                             // (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
+    *hi = ahibhi + (alobhi >> 32)
+        + (ahiblo >> 32); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1)   =
+                          // (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
     *lo = (aloblo & 0xffffffffULL) | (alobhi << 32);
 }
 
 // Move the most significant non-zero bit to the MSB
-// Note: not general. Only works if the most significant non-zero bit is at MSB-1
-static inline void renormalize( cl_ulong *hi, cl_ulong *lo, int *exponent )
+// Note: not general. Only works if the most significant non-zero bit is at
+// MSB-1
+static inline void renormalize(cl_ulong *hi, cl_ulong *lo, int *exponent)
 {
-    if( 0 == (0x8000000000000000ULL & *hi ))
+    if (0 == (0x8000000000000000ULL & *hi))
     {
         *hi <<= 1;
         *hi |= *lo >> 63;
@@ -2411,74 +2798,84 @@ static inline void renormalize( cl_ulong *hi, cl_ulong *lo, int *exponent )
     }
 }
 
-static double round_to_nearest_even_double( cl_ulong hi, cl_ulong lo, int exponent );
-static double round_to_nearest_even_double( cl_ulong hi, cl_ulong lo, int exponent )
+static double round_to_nearest_even_double(cl_ulong hi, cl_ulong lo,
+                                           int exponent);
+static double round_to_nearest_even_double(cl_ulong hi, cl_ulong lo,
+                                           int exponent)
 {
-    union{ cl_ulong u; cl_double d;} u;
+    union {
+        cl_ulong u;
+        cl_double d;
+    } u;
 
     // edges
-    if( exponent > 1023 )        return INFINITY;
-    if( exponent == -1075 && (hi | (lo!=0)) > 0x8000000000000000ULL )
-        return HEX_DBL( +, 1, 0, -, 1074 );
-    if( exponent <= -1075 )       return 0.0;
+    if (exponent > 1023) return INFINITY;
+    if (exponent == -1075 && (hi | (lo != 0)) > 0x8000000000000000ULL)
+        return HEX_DBL(+, 1, 0, -, 1074);
+    if (exponent <= -1075) return 0.0;
 
-    //Figure out which bits go where
+    // Figure out which bits go where
     int shift = 11;
-    if( exponent < -1022 )
+    if (exponent < -1022)
     {
-        shift -= 1022 + exponent;               // subnormal: shift is not 52
-        exponent = -1023;                       //              set exponent to 0
+        shift -= 1022 + exponent; // subnormal: shift is not 52
+        exponent = -1023; //              set exponent to 0
     }
     else
-        hi &= 0x7fffffffffffffffULL;           // normal: leading bit is implicit. Remove it.
+        hi &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
+                                     // it.
 
     // Assemble the double (round toward zero)
-    u.u = (hi >> shift) | ((cl_ulong) (exponent + 1023) << 52);
+    u.u = (hi >> shift) | ((cl_ulong)(exponent + 1023) << 52);
 
     // put a representation of the residual bits into hi
-    hi <<= (64-shift);
+    hi <<= (64 - shift);
     hi |= lo >> shift;
-    lo <<= (64-shift );
+    lo <<= (64 - shift);
     hi |= lo != 0;
 
-    //round to nearest, ties to even
-    if( hi < 0x8000000000000000ULL )    return u.d;
-    if( hi == 0x8000000000000000ULL )   u.u += u.u & 1ULL;
-    else                                u.u++;
+    // round to nearest, ties to even
+    if (hi < 0x8000000000000000ULL) return u.d;
+    if (hi == 0x8000000000000000ULL)
+        u.u += u.u & 1ULL;
+    else
+        u.u++;
 
     return u.d;
 }
 
-// Shift right.  Bits lost on the right will be OR'd together and OR'd with the LSB
-static inline void shift_right_sticky_128( cl_ulong *hi, cl_ulong *lo, int shift );
-static inline void shift_right_sticky_128( cl_ulong *hi, cl_ulong *lo, int shift )
+// Shift right.  Bits lost on the right will be OR'd together and OR'd with the
+// LSB
+static inline void shift_right_sticky_128(cl_ulong *hi, cl_ulong *lo,
+                                          int shift);
+static inline void shift_right_sticky_128(cl_ulong *hi, cl_ulong *lo, int shift)
 {
     cl_ulong sticky = 0;
     cl_ulong h = *hi;
     cl_ulong l = *lo;
 
-    if( shift >= 64 )
+    if (shift >= 64)
     {
         shift -= 64;
         sticky = 0 != lo;
         l = h;
         h = 0;
-        if( shift >= 64 )
+        if (shift >= 64)
         {
             sticky |= (0 != l);
             l = 0;
         }
         else
         {
-            sticky |= (0 != (l << (64-shift)));
+            sticky |= (0 != (l << (64 - shift)));
             l >>= shift;
         }
     }
     else
     {
-        sticky |= (0 != (l << (64-shift)));
+        sticky |= (0 != (l << (64 - shift)));
         l >>= shift;
-        l |=  h << (64-shift);
+        l |= h << (64 - shift);
         h >>= shift;
     }
 
@@ -2487,9 +2884,12 @@ static inline void shift_right_sticky_128( cl_ulong *hi, cl_ulong *lo, int shift
 }
 
 // 128-bit add  of ((*hi << 64) + *lo) + ((chi << 64) + clo)
-// If the 129 bit result doesn't fit, bits lost off the right end will be OR'd with the LSB
-static inline void add128( cl_ulong *hi, cl_ulong *lo, cl_ulong chi, cl_ulong clo, int *exp );
-static inline void add128( cl_ulong *hi, cl_ulong *lo, cl_ulong chi, cl_ulong clo, int *exponent )
+// If the 129 bit result doesn't fit, bits lost off the right end will be OR'd
+// with the LSB
+static inline void add128(cl_ulong *hi, cl_ulong *lo, cl_ulong chi,
+                          cl_ulong clo, int *exp);
+static inline void add128(cl_ulong *hi, cl_ulong *lo, cl_ulong chi,
+                          cl_ulong clo, int *exponent)
 {
     cl_ulong carry, carry2;
     // extended precision add
@@ -2497,15 +2897,16 @@ static inline void add128( cl_ulong *hi, cl_ulong *lo, cl_ulong chi, cl_ulong cl
     chi = add_carry(*hi, chi, &carry2);
     chi = add_carry(chi, carry, &carry);
 
-    //If we overflowed the 128 bit result
-    if( carry || carry2 )
+    // If we overflowed the 128 bit result
+    if (carry || carry2)
     {
-        carry = clo & 1;                        // set aside low bit
-        clo >>= 1;                              // right shift low 1
-        clo |= carry;                           // or back in the low bit, so we don't come to believe this is an exact half way case for rounding
-        clo |= chi << 63;                       // move lowest high bit into highest bit of lo
-        chi >>= 1;                              // right shift hi
-        chi |= 0x8000000000000000ULL;           // move the carry bit into hi.
+        carry = clo & 1; // set aside low bit
+        clo >>= 1; // right shift low 1
+        clo |= carry; // or back in the low bit, so we don't come to believe
+                      // this is an exact half way case for rounding
+        clo |= chi << 63; // move lowest high bit into highest bit of lo
+        chi >>= 1; // right shift hi
+        chi |= 0x8000000000000000ULL; // move the carry bit into hi.
         *exponent = *exponent + 1;
     }
 
@@ -2514,48 +2915,51 @@ static inline void add128( cl_ulong *hi, cl_ulong *lo, cl_ulong chi, cl_ulong cl
 }
 
 // 128-bit subtract  of ((chi << 64) + clo)  - ((*hi << 64) + *lo)
-static inline void sub128( cl_ulong *chi, cl_ulong *clo, cl_ulong hi, cl_ulong lo, cl_ulong *signC, int *expC );
-static inline void sub128( cl_ulong *chi, cl_ulong *clo, cl_ulong hi, cl_ulong lo, cl_ulong *signC, int *expC )
+static inline void sub128(cl_ulong *chi, cl_ulong *clo, cl_ulong hi,
+                          cl_ulong lo, cl_ulong *signC, int *expC);
+static inline void sub128(cl_ulong *chi, cl_ulong *clo, cl_ulong hi,
+                          cl_ulong lo, cl_ulong *signC, int *expC)
 {
     cl_ulong rHi = *chi;
     cl_ulong rLo = *clo;
     cl_ulong carry, carry2;
 
-    //extended precision subtract
+    // extended precision subtract
     rLo = sub_carry(rLo, lo, &carry);
     rHi = sub_carry(rHi, hi, &carry2);
     rHi = sub_carry(rHi, carry, &carry);
 
     // Check for sign flip
-    if( carry || carry2 )
+    if (carry || carry2)
     {
         *signC ^= 0x8000000000000000ULL;
 
-        //negate rLo, rHi:   -x = (x ^ -1) + 1
+        // negate rLo, rHi:   -x = (x ^ -1) + 1
         rLo ^= -1ULL;
         rHi ^= -1ULL;
         rLo++;
         rHi += 0 == rLo;
     }
 
-    // normalize -- move the most significant non-zero bit to the MSB, and adjust exponent accordingly
-    if( rHi == 0 )
+    // normalize -- move the most significant non-zero bit to the MSB, and
+    // adjust exponent accordingly
+    if (rHi == 0)
     {
         rHi = rLo;
         *expC = *expC - 64;
         rLo = 0;
     }
 
-    if( rHi )
+    if (rHi)
     {
         int shift = 32;
         cl_ulong test = 1ULL << 32;
-        while( 0 == (rHi & 0x8000000000000000ULL))
+        while (0 == (rHi & 0x8000000000000000ULL))
         {
-            if( rHi < test )
+            if (rHi < test)
             {
                 rHi <<= shift;
-                rHi |= rLo >> (64-shift);
+                rHi |= rLo >> (64 - shift);
                 rLo <<= shift;
                 *expC = *expC - shift;
             }
@@ -2565,7 +2969,7 @@ static inline void sub128( cl_ulong *chi, cl_ulong *clo, cl_ulong hi, cl_ulong l
     }
     else
     {
-        //zero
+        // zero
         *expC = INT_MIN;
         *signC = 0;
     }
@@ -2575,7 +2979,7 @@ static inline void sub128( cl_ulong *chi, cl_ulong *clo, cl_ulong hi, cl_ulong l
     *clo = rLo;
 }
 
-long double reference_fmal( long double x, long double y, long double z)
+long double reference_fmal(long double x, long double y, long double z)
 {
     static const cl_ulong kMSB = 0x8000000000000000ULL;
 
@@ -2585,75 +2989,91 @@ long double reference_fmal( long double x, long double y, long double z)
     double c = z;
 
     // Make bits accessible
-    union{ cl_ulong u; cl_double d; } ua; ua.d = a;
-    union{ cl_ulong u; cl_double d; } ub; ub.d = b;
-    union{ cl_ulong u; cl_double d; } uc; uc.d = c;
+    union {
+        cl_ulong u;
+        cl_double d;
+    } ua;
+    ua.d = a;
+    union {
+        cl_ulong u;
+        cl_double d;
+    } ub;
+    ub.d = b;
+    union {
+        cl_ulong u;
+        cl_double d;
+    } uc;
+    uc.d = c;
 
     // deal with Nans, infinities and zeros
-    if( isnan( a ) || isnan( b ) || isnan(c)    ||
-        isinf( a ) || isinf( b ) || isinf(c)    ||
-        0 == ( ua.u & ~kMSB)                ||  // a == 0, defeat host FTZ behavior
-        0 == ( ub.u & ~kMSB)                ||  // b == 0, defeat host FTZ behavior
-        0 == ( uc.u & ~kMSB)                )   // c == 0, defeat host FTZ behavior
+    if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b) || isinf(c)
+        || 0 == (ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior
+        0 == (ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior
+        0 == (uc.u & ~kMSB)) // c == 0, defeat host FTZ behavior
     {
-        if( isinf( c ) && !isinf(a) && !isinf(b) )
-            return (c + a) + b;
+        if (isinf(c) && !isinf(a) && !isinf(b)) return (c + a) + b;
 
-        a = (double) reference_multiplyl( a, b );   // some risk that the compiler will insert a non-compliant fma here on some platforms.
-        return reference_addl(a, c);                // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
+        a = (double)reference_multiplyl(
+            a, b); // some risk that the compiler will insert a non-compliant
+                   // fma here on some platforms.
+        return reference_addl(
+            a,
+            c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
     }
 
     // extract exponent and mantissa
     //   exponent is a standard unbiased signed integer
     //   mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
     cl_ulong mantA, mantB, mantC;
-    int expA = extract( a, &mantA );
-    int expB = extract( b, &mantB );
-    int expC = extract( c, &mantC );
-    cl_ulong signC = uc.u & kMSB;               // We'll need the sign bit of C later to decide if we are adding or subtracting
+    int expA = extract(a, &mantA);
+    int expB = extract(b, &mantB);
+    int expC = extract(c, &mantC);
+    cl_ulong signC = uc.u & kMSB; // We'll need the sign bit of C later to
+                                  // decide if we are adding or subtracting
 
-// exact product of A and B
+    // exact product of A and B
     int exponent = expA + expB;
     cl_ulong sign = (ua.u ^ ub.u) & kMSB;
     cl_ulong hi, lo;
-    mul128( mantA, mantB, &hi, &lo );
+    mul128(mantA, mantB, &hi, &lo);
 
     // renormalize
-    if( 0 == (kMSB & hi) )
+    if (0 == (kMSB & hi))
     {
         hi <<= 1;
         hi |= lo >> 63;
         lo <<= 1;
     }
     else
-        exponent++;         // 2**63 * 2**63 gives 2**126. If the MSB was set, then our exponent increased.
+        exponent++; // 2**63 * 2**63 gives 2**126. If the MSB was set, then our
+                    // exponent increased.
 
-//infinite precision add
+    // infinite precision add
     cl_ulong chi = mantC;
     cl_ulong clo = 0;
 
-    if( exponent >= expC )
+    if (exponent >= expC)
     {
         // Normalize C relative to the product
-        if( exponent > expC )
-            shift_right_sticky_128( &chi, &clo, exponent - expC );
+        if (exponent > expC)
+            shift_right_sticky_128(&chi, &clo, exponent - expC);
 
         // Add
-        if( sign ^ signC )
-            sub128( &hi, &lo, chi, clo, &sign, &exponent );
+        if (sign ^ signC)
+            sub128(&hi, &lo, chi, clo, &sign, &exponent);
         else
-            add128( &hi, &lo, chi, clo, &exponent );
+            add128(&hi, &lo, chi, clo, &exponent);
     }
     else
     {
         // Shift the product relative to C so that their exponents match
-        shift_right_sticky_128( &hi, &lo, expC - exponent );
+        shift_right_sticky_128(&hi, &lo, expC - exponent);
 
         // add
-        if( sign ^ signC )
-            sub128( &chi, &clo, hi, lo, &signC, &expC );
+        if (sign ^ signC)
+            sub128(&chi, &clo, hi, lo, &signC, &expC);
         else
-            add128( &chi, &clo, hi, lo, &expC );
+            add128(&chi, &clo, hi, lo, &expC);
 
         hi = chi;
         lo = clo;
@@ -2671,61 +3091,54 @@ long double reference_fmal( long double x, long double y, long double z)
 }
 
 
+long double reference_madl(long double a, long double b, long double c)
+{
+    return a * b + c;
+}
 
+// long double my_nextafterl(long double x, long double y){  return (long
+// double) nextafter( (double) x, (double) y ); }
 
-long double reference_madl( long double a, long double b, long double c) { return a * b + c; }
-
-//long double my_nextafterl(long double x, long double y){  return (long double) nextafter( (double) x, (double) y ); }
-
-long double reference_recipl( long double x){ return 1.0L / x; }
+long double reference_recipl(long double x) { return 1.0L / x; }
 
-long double reference_rootnl( long double x, int i)
+long double reference_rootnl(long double x, int i)
 {
-    double hi,  lo;
+    double hi, lo;
     long double l;
-    //rootn ( x, 0 )  returns a NaN.
-    if( 0 == i )
-        return cl_make_nan();
+    // rootn ( x, 0 )  returns a NaN.
+    if (0 == i) return cl_make_nan();
 
-    //rootn ( x, n )  returns a NaN for x < 0 and n is even.
-    if( x < 0.0L && 0 == (i&1) )
-        return cl_make_nan();
+    // rootn ( x, n )  returns a NaN for x < 0 and n is even.
+    if (x < 0.0L && 0 == (i & 1)) return cl_make_nan();
 
-    if( isinf(x) )
+    if (isinf(x))
     {
-        if( i < 0 )
-            return reference_copysignl(0.0L, x);
+        if (i < 0) return reference_copysignl(0.0L, x);
 
         return x;
     }
 
-    if( x == 0.0 )
+    if (x == 0.0)
     {
-        switch( i & 0x80000001 )
+        switch (i & 0x80000001)
         {
-            //rootn ( +-0,  n ) is +0 for even n > 0.
-            case 0:
-                return 0.0L;
+            // rootn ( +-0,  n ) is +0 for even n > 0.
+            case 0: return 0.0L;
 
-            //rootn ( +-0,  n ) is +-0 for odd n > 0.
-            case 1:
-                return x;
+            // rootn ( +-0,  n ) is +-0 for odd n > 0.
+            case 1: return x;
 
-            //rootn ( +-0,  n ) is +inf for even n < 0.
-            case 0x80000000:
-                return INFINITY;
+            // rootn ( +-0,  n ) is +inf for even n < 0.
+            case 0x80000000: return INFINITY;
 
-            //rootn ( +-0,  n ) is +-inf for odd n < 0.
-            case 0x80000001:
-                return copysign(INFINITY, x);
+            // rootn ( +-0,  n ) is +-inf for odd n < 0.
+            case 0x80000001: return copysign(INFINITY, x);
         }
     }
 
-    if( i == 1 )
-        return x;
+    if (i == 1) return x;
 
-    if( i == -1 )
-        return 1.0 / x;
+    if (i == -1) return 1.0 / x;
 
     long double sign = x;
     x = reference_fabsl(x);
@@ -2733,167 +3146,174 @@ long double reference_rootnl( long double x, int i)
     DivideDD(&iHi, &iLo, 1.0, i);
     x = reference_powl(x, iHi) * reference_powl(x, iLo);
 
-    return reference_copysignl( x, sign );
-
+    return reference_copysignl(x, sign);
 }
 
-long double reference_rsqrtl( long double x){ return 1.0L / sqrtl(x); }
-//long double reference_sincosl( long double x, long double *c ){ *c = reference_cosl(x); return reference_sinl(x); }
-long double reference_sinpil( long double x)
+long double reference_rsqrtl(long double x) { return 1.0L / sqrtl(x); }
+// long double reference_sincosl( long double x, long double *c ){ *c =
+// reference_cosl(x); return reference_sinl(x); }
+long double reference_sinpil(long double x)
 {
     double r = reduce1l(x);
 
     // reduce to [-0.5, 0.5]
-    if( r < -0.5L )
+    if (r < -0.5L)
         r = -1.0L - r;
-    else if ( r > 0.5L )
+    else if (r > 0.5L)
         r = 1.0L - r;
 
     // sinPi zeros have the same sign as x
-    if( r == 0.0L )
-        return reference_copysignl(0.0L, x);
+    if (r == 0.0L) return reference_copysignl(0.0L, x);
 
-    return reference_sinl( r * M_PIL );
+    return reference_sinl(r * M_PIL);
 }
 
-long double reference_tanpil( long double x)
+long double reference_tanpil(long double x)
 {
     // set aside the sign  (allows us to preserve sign of -0)
-    long double sign = reference_copysignl( 1.0L, x);
+    long double sign = reference_copysignl(1.0L, x);
     long double z = reference_fabsl(x);
 
     // if big and even  -- caution: only works if x only has single precision
-    if( z >= HEX_LDBL( +, 1, 0, +, 53 ) )
+    if (z >= HEX_LDBL(+, 1, 0, +, 53))
     {
-        if( z == INFINITY )
-            return x - x;       // nan
+        if (z == INFINITY) return x - x; // nan
 
-        return reference_copysignl( 0.0L, x);   // tanpi ( n ) is copysign( 0.0, n)  for even integers n.
+        return reference_copysignl(
+            0.0L, x); // tanpi ( n ) is copysign( 0.0, n)  for even integers n.
     }
 
     // reduce to the range [ -0.5, 0.5 ]
-    long double nearest = reference_rintl( z );     // round to nearest even places n + 0.5 values in the right place for us
-    int64_t i = (int64_t) nearest;          // test above against 0x1.0p53 avoids overflow here
+    long double nearest =
+        reference_rintl(z); // round to nearest even places n + 0.5 values in
+                            // the right place for us
+    int64_t i =
+        (int64_t)nearest; // test above against 0x1.0p53 avoids overflow here
     z -= nearest;
 
-    //correction for odd integer x for the right sign of zero
-    if( (i&1) && z == 0.0L )
-        sign = -sign;
+    // correction for odd integer x for the right sign of zero
+    if ((i & 1) && z == 0.0L) sign = -sign;
 
     // track changes to the sign
-    sign *= reference_copysignl(1.0L, z);       // really should just be an xor
-    z = reference_fabsl(z);                    // remove the sign again
+    sign *= reference_copysignl(1.0L, z); // really should just be an xor
+    z = reference_fabsl(z); // remove the sign again
 
     // reduce once more
-    // If we don't do this, rounding error in z * M_PI will cause us not to return infinities properly
-    if( z > 0.25L )
+    // If we don't do this, rounding error in z * M_PI will cause us not to
+    // return infinities properly
+    if (z > 0.25L)
     {
         z = 0.5L - z;
-        return sign / reference_tanl( z * M_PIL );      // use system tan to get the right result
+        return sign
+            / reference_tanl(z
+                             * M_PIL); // use system tan to get the right result
     }
 
     //
-    return sign * reference_tanl( z * M_PIL );          // use system tan to get the right result
+    return sign
+        * reference_tanl(z * M_PIL); // use system tan to get the right result
 }
 
-long double reference_pownl( long double x, int i ){ return reference_powl( x, (long double) i ); }
+long double reference_pownl(long double x, int i)
+{
+    return reference_powl(x, (long double)i);
+}
 
-long double reference_powrl( long double x, long double y )
+long double reference_powrl(long double x, long double y)
 {
-    //powr ( x, y ) returns NaN for x < 0.
-    if( x < 0.0L )
-        return cl_make_nan();
+    // powr ( x, y ) returns NaN for x < 0.
+    if (x < 0.0L) return cl_make_nan();
 
-    //powr ( x, NaN ) returns the NaN for x >= 0.
-    //powr ( NaN, y ) returns the NaN.
-    if( isnan(x) || isnan(y) )
-        return x + y;   // Note: behavior different here than for pow(1,NaN), pow(NaN, 0)
+    // powr ( x, NaN ) returns the NaN for x >= 0.
+    // powr ( NaN, y ) returns the NaN.
+    if (isnan(x) || isnan(y))
+        return x + y; // Note: behavior different here than for pow(1,NaN),
+                      // pow(NaN, 0)
 
-    if( x == 1.0L )
+    if (x == 1.0L)
     {
-        //powr ( +1, +-inf ) returns NaN.
-        if( reference_fabsl(y) == INFINITY )
-            return cl_make_nan();
+        // powr ( +1, +-inf ) returns NaN.
+        if (reference_fabsl(y) == INFINITY) return cl_make_nan();
 
-        //powr ( +1, y ) is 1 for finite y.    (NaN handled above)
+        // powr ( +1, y ) is 1 for finite y.    (NaN handled above)
         return 1.0L;
     }
 
-    if( y == 0.0L )
+    if (y == 0.0L)
     {
-        //powr ( +inf, +-0 ) returns NaN.
-        //powr ( +-0, +-0 ) returns NaN.
-        if( x == 0.0L || x == INFINITY )
-            return cl_make_nan();
+        // powr ( +inf, +-0 ) returns NaN.
+        // powr ( +-0, +-0 ) returns NaN.
+        if (x == 0.0L || x == INFINITY) return cl_make_nan();
 
-        //powr ( x, +-0 ) is 1 for finite x > 0.  (x <= 0, NaN, INF already handled above)
+        // powr ( x, +-0 ) is 1 for finite x > 0.  (x <= 0, NaN, INF already
+        // handled above)
         return 1.0L;
     }
 
-    if( x == 0.0L )
+    if (x == 0.0L)
     {
-        //powr ( +-0, -inf) is +inf.
-        //powr ( +-0, y ) is +inf for finite y < 0.
-        if( y < 0.0L )
-            return INFINITY;
+        // powr ( +-0, -inf) is +inf.
+        // powr ( +-0, y ) is +inf for finite y < 0.
+        if (y < 0.0L) return INFINITY;
 
-        //powr ( +-0, y ) is +0 for y > 0.    (NaN, y==0 handled above)
+        // powr ( +-0, y ) is +0 for y > 0.    (NaN, y==0 handled above)
         return 0.0L;
     }
 
-    return reference_powl( x, y );
+    return reference_powl(x, y);
 }
 
-//long double my_fdiml( long double x, long double y){ return fdim( (double) x, (double) y ); }
-long double reference_addl( long double x, long double y)
+// long double my_fdiml( long double x, long double y){ return fdim( (double) x,
+// (double) y ); }
+long double reference_addl(long double x, long double y)
 {
-    volatile double a = (double) x;
-    volatile double b = (double) y;
+    volatile double a = (double)x;
+    volatile double b = (double)y;
 
-#if defined( __SSE2__ )
+#if defined(__SSE2__)
     // defeat x87
-    __m128d va = _mm_set_sd( (double) a );
-    __m128d vb = _mm_set_sd( (double) b );
-    va = _mm_add_sd( va, vb );
-    _mm_store_sd( (double*) &a, va );
+    __m128d va = _mm_set_sd((double)a);
+    __m128d vb = _mm_set_sd((double)b);
+    va = _mm_add_sd(va, vb);
+    _mm_store_sd((double *)&a, va);
 #else
     a += b;
 #endif
-    return (long double) a;
+    return (long double)a;
 }
 
-long double reference_subtractl( long double x, long double y)
+long double reference_subtractl(long double x, long double y)
 {
-    volatile double a = (double) x;
-    volatile double b = (double) y;
+    volatile double a = (double)x;
+    volatile double b = (double)y;
 
-#if defined( __SSE2__ )
+#if defined(__SSE2__)
     // defeat x87
-    __m128d va = _mm_set_sd( (double) a );
-    __m128d vb = _mm_set_sd( (double) b );
-    va = _mm_sub_sd( va, vb );
-    _mm_store_sd( (double*) &a, va );
+    __m128d va = _mm_set_sd((double)a);
+    __m128d vb = _mm_set_sd((double)b);
+    va = _mm_sub_sd(va, vb);
+    _mm_store_sd((double *)&a, va);
 #else
     a -= b;
 #endif
-    return (long double) a;
+    return (long double)a;
 }
 
-long double reference_multiplyl( long double x, long double y)
+long double reference_multiplyl(long double x, long double y)
 {
-    volatile double a = (double) x;
-    volatile double b = (double) y;
+    volatile double a = (double)x;
+    volatile double b = (double)y;
 
-#if defined( __SSE2__ )
+#if defined(__SSE2__)
     // defeat x87
-    __m128d va = _mm_set_sd( (double) a );
-    __m128d vb = _mm_set_sd( (double) b );
-    va = _mm_mul_sd( va, vb );
-    _mm_store_sd( (double*) &a, va );
+    __m128d va = _mm_set_sd((double)a);
+    __m128d vb = _mm_set_sd((double)b);
+    va = _mm_mul_sd(va, vb);
+    _mm_store_sd((double *)&a, va);
 #else
     a *= b;
 #endif
-    return (long double) a;
+    return (long double)a;
 }
 
 /*long double my_remquol( long double x, long double y, int *iptr )
@@ -2908,22 +3328,22 @@ long double reference_multiplyl( long double x, long double y)
 
     return remquo( (double) x, (double) y, iptr );
 }*/
-long double reference_lgamma_rl( long double x, int *signp )
+long double reference_lgamma_rl(long double x, int *signp)
 {
-//    long double lgamma_val = (long double)reference_lgamma( (double)x );
-//    *signp = signgam;
+    //    long double lgamma_val = (long double)reference_lgamma( (double)x );
+    //    *signp = signgam;
     *signp = 0;
     return x;
 }
 
 
-int reference_isequall( long double x, long double y){ return x == y; }
-int reference_isfinitel( long double x){ return 0 != isfinite(x); }
-int reference_isgreaterl( long double x, long double y){ return x > y; }
-int reference_isgreaterequall( long double x, long double y){ return x >= y; }
-int reference_isinfl( long double x){ return 0 != isinf(x); }
-int reference_islessl( long double x, long double y){ return x < y; }
-int reference_islessequall( long double x, long double y){ return x <= y; }
+int reference_isequall(long double x, long double y) { return x == y; }
+int reference_isfinitel(long double x) { return 0 != isfinite(x); }
+int reference_isgreaterl(long double x, long double y) { return x > y; }
+int reference_isgreaterequall(long double x, long double y) { return x >= y; }
+int reference_isinfl(long double x) { return 0 != isinf(x); }
+int reference_islessl(long double x, long double y) { return x < y; }
+int reference_islessequall(long double x, long double y) { return x <= y; }
 #if defined(__INTEL_COMPILER)
 int reference_islessgreaterl(long double x, long double y)
 {
@@ -2935,69 +3355,77 @@ int reference_islessgreaterl(long double x, long double y)
     return 0 != islessgreater(x, y);
 }
 #endif
-int reference_isnanl( long double x){ return 0 != isnan( x ); }
-int reference_isnormall( long double x){ return 0 != isnormal( (double) x ); }
-int reference_isnotequall( long double x, long double y){ return x != y; }
-int reference_isorderedl( long double x, long double y){ return x == x && y == y; }
-int reference_isunorderedl( long double x, long double y){ return isnan(x) || isnan( y ); }
-#if defined( __INTEL_COMPILER )
-int reference_signbitl( long double x){ return 0 != signbitl( x ); }
+int reference_isnanl(long double x) { return 0 != isnan(x); }
+int reference_isnormall(long double x) { return 0 != isnormal((double)x); }
+int reference_isnotequall(long double x, long double y) { return x != y; }
+int reference_isorderedl(long double x, long double y)
+{
+    return x == x && y == y;
+}
+int reference_isunorderedl(long double x, long double y)
+{
+    return isnan(x) || isnan(y);
+}
+#if defined(__INTEL_COMPILER)
+int reference_signbitl(long double x) { return 0 != signbitl(x); }
 #else
-int reference_signbitl( long double x){ return 0 != signbit( x ); }
+int reference_signbitl(long double x) { return 0 != signbit(x); }
 #endif
-long double reference_copysignl( long double x, long double y);
-long double reference_roundl( long double x );
+long double reference_copysignl(long double x, long double y);
+long double reference_roundl(long double x);
 long double reference_cbrtl(long double x);
 
-long double reference_copysignl( long double x, long double y )
+long double reference_copysignl(long double x, long double y)
 {
-    // We hope that the long double to double conversion proceeds with sign fidelity,
-    // even for zeros and NaNs
-    union{ double d; cl_ulong u;}u; u.d = (double) y;
+    // We hope that the long double to double conversion proceeds with sign
+    // fidelity, even for zeros and NaNs
+    union {
+        double d;
+        cl_ulong u;
+    } u;
+    u.d = (double)y;
 
     x = reference_fabsl(x);
-    if( u.u >> 63 )
-        x = -x;
+    if (u.u >> 63) x = -x;
 
     return x;
 }
 
-long double reference_roundl( long double x )
+long double reference_roundl(long double x)
 {
     // Since we are just using this to verify double precision, we can
     // use the double precision copysign here
 
 #if defined(__MINGW32__) && defined(__x86_64__)
     long double absx = reference_fabsl(x);
-    if (absx < 0.5L)
-    return reference_copysignl(0.0L, x);
+    if (absx < 0.5L) return reference_copysignl(0.0L, x);
 #endif
-    return round( (double) x );
+    return round((double)x);
 }
 
-long double reference_truncl( long double x )
+long double reference_truncl(long double x)
 {
     // Since we are just using this to verify double precision, we can
     // use the double precision copysign here
-    return trunc( (double) x );
+    return trunc((double)x);
 }
 
 static long double reference_scalblnl(long double x, long n);
 
 long double reference_cbrtl(long double x)
 {
-    double yhi = HEX_DBL( +, 1, 5555555555555, -, 2 );
-    double ylo = HEX_DBL( +, 1, 558, -, 56 );
+    double yhi = HEX_DBL(+, 1, 5555555555555, -, 2);
+    double ylo = HEX_DBL(+, 1, 558, -, 56);
 
-    double fabsx = reference_fabs( x );
+    double fabsx = reference_fabs(x);
 
-    if( isnan(x) || fabsx == 1.0 || fabsx == 0.0 || isinf(x) )
-        return x;
+    if (isnan(x) || fabsx == 1.0 || fabsx == 0.0 || isinf(x)) return x;
 
     double iy = 0.0;
     double log2x_hi, log2x_lo;
 
-    // extended precision log .... accurate to at least 64-bits + couple of guard bits
+    // extended precision log .... accurate to at least 64-bits + couple of
+    // guard bits
     __log2_ep(&log2x_hi, &log2x_lo, fabsx);
 
     double ylog2x_hi, ylog2x_lo;
@@ -3009,20 +3437,24 @@ long double reference_cbrtl(long double x)
     MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo);
 
     long double powxy;
-    if(isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200)) {
-        powxy = reference_signbit(ylog2x_hi) ? HEX_DBL( +, 0, 0, +, 0 ) : INFINITY;
-    } else {
+    if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200))
+    {
+        powxy =
+            reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY;
+    }
+    else
+    {
         // separate integer + fractional part
         long int m = lrint(ylog2x_hi);
         AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0);
 
         // revert to long double arithemtic
-        long double ylog2x = (long double) ylog2x_hi + (long double) ylog2x_lo;
-        powxy = reference_exp2l( ylog2x );
+        long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo;
+        powxy = reference_exp2l(ylog2x);
         powxy = reference_scalblnl(powxy, m);
     }
 
-    return reference_copysignl( powxy, x );
+    return reference_copysignl(powxy, x);
 }
 
 /*
@@ -3064,24 +3496,24 @@ long double scalbnl( long double x, int i )
 }
 */
 
-long double reference_rintl( long double x )
+long double reference_rintl(long double x)
 {
 #if defined(__PPC__)
-  // On PPC, long doubles are maintained as 2 doubles. Therefore, the combined
-  // mantissa can represent more than LDBL_MANT_DIG binary digits.
-  x = rintl(x);
+    // On PPC, long doubles are maintained as 2 doubles. Therefore, the combined
+    // mantissa can represent more than LDBL_MANT_DIG binary digits.
+    x = rintl(x);
 #else
-    static long double magic[2] = { 0.0L, 0.0L};
+    static long double magic[2] = { 0.0L, 0.0L };
 
-    if( 0.0L == magic[0] )
+    if (0.0L == magic[0])
     {
         magic[0] = scalbnl(0.5L, LDBL_MANT_DIG);
         magic[1] = scalbnl(-0.5L, LDBL_MANT_DIG);
     }
 
-    if( reference_fabsl(x) < magic[0] && x != 0.0L )
+    if (reference_fabsl(x) < magic[0] && x != 0.0L)
     {
-        long double m = magic[ x < 0 ];
+        long double m = magic[x < 0];
         x += m;
         x -= m;
     }
@@ -3094,7 +3526,7 @@ long double reference_rintl( long double x )
 static void __sqrt_ep(double *rhi, double *rlo, double xhi, double xlo)
 {
     // approximate reciprocal sqrt
-    double thi = 1.0 / sqrt( xhi );
+    double thi = 1.0 / sqrt(xhi);
     double tlo = 0.0;
 
     // One newton iteration in double-double
@@ -3108,34 +3540,31 @@ static void __sqrt_ep(double *rhi, double *rlo, double xhi, double xlo)
     MulDD(rhi, rlo, yhi, ylo, xhi, xlo);
 }
 
-long double reference_acoshl( long double x )
+long double reference_acoshl(long double x)
 {
-/*
- * ====================================================
- * This function derived from fdlibm http://www.netlib.org
- * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- *
- */
-    if( isnan(x) || isinf(x))
-        return x + fabsl(x);
+    /*
+     * ====================================================
+     * This function derived from fdlibm http://www.netlib.org
+     * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+     *
+     * Developed at SunSoft, a Sun Microsystems, Inc. business.
+     * Permission to use, copy, modify, and distribute this
+     * software is freely granted, provided that this notice
+     * is preserved.
+     * ====================================================
+     *
+     */
+    if (isnan(x) || isinf(x)) return x + fabsl(x);
 
-    if( x < 1.0L )
-        return cl_make_nan();
+    if (x < 1.0L) return cl_make_nan();
 
-    if( x == 1.0L )
-        return 0.0L;
+    if (x == 1.0L) return 0.0L;
 
-    if( x > HEX_LDBL( +, 1, 0, +, 60 ) )
+    if (x > HEX_LDBL(+, 1, 0, +, 60))
         return reference_logl(x) + 0.693147180559945309417232121458176568L;
 
-    if( x > 2.0L )
-        return reference_logl(2.0L * x - 1.0L / (x + sqrtl(x*x - 1.0L)));
+    if (x > 2.0L)
+        return reference_logl(2.0L * x - 1.0L / (x + sqrtl(x * x - 1.0L)));
 
     double hi, lo;
     MulD(&hi, &lo, x, x);
@@ -3144,286 +3573,301 @@ long double reference_acoshl( long double x )
     AddDD(&hi, &lo, hi, lo, x, 0.0);
     double correction = lo / hi;
     __log2_ep(&hi, &lo, hi);
-    double log2Hi = HEX_DBL( +, 1, 62e42fefa39ef, -, 1 );
-    double log2Lo = HEX_DBL( +, 1, abc9e3b39803f, -, 56 );
+    double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
+    double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56);
     MulDD(&hi, &lo, hi, lo, log2Hi, log2Lo);
     AddDD(&hi, &lo, hi, lo, correction, 0.0);
 
     return hi + lo;
 }
 
-long double reference_asinhl( long double x )
+long double reference_asinhl(long double x)
 {
     long double cutoff = 0.0L;
-    const long double ln2 = HEX_LDBL( +, b, 17217f7d1cf79ab, -, 4 );
+    const long double ln2 = HEX_LDBL(+, b, 17217f7d1cf79ab, -, 4);
 
-    if( cutoff == 0.0L )
-        cutoff = reference_ldexpl(1.0L, -LDBL_MANT_DIG);
+    if (cutoff == 0.0L) cutoff = reference_ldexpl(1.0L, -LDBL_MANT_DIG);
 
-    if( isnan(x) || isinf(x) )
-        return x + x;
+    if (isnan(x) || isinf(x)) return x + x;
 
     long double absx = reference_fabsl(x);
-    if( absx < cutoff )
-        return x;
+    if (absx < cutoff) return x;
 
     long double sign = reference_copysignl(1.0L, x);
 
-    if( absx <= 4.0/3.0 ) {
-        return sign * reference_log1pl( absx + x*x / (1.0 + sqrtl(1.0 + x*x)));
+    if (absx <= 4.0 / 3.0)
+    {
+        return sign
+            * reference_log1pl(absx + x * x / (1.0 + sqrtl(1.0 + x * x)));
     }
-    else if( absx <= HEX_LDBL( +, 1, 0, +, 27 ) ) {
-        return sign * reference_logl( 2.0L * absx + 1.0L / (sqrtl( x * x + 1.0 ) + absx));
+    else if (absx <= HEX_LDBL(+, 1, 0, +, 27))
+    {
+        return sign
+            * reference_logl(2.0L * absx + 1.0L / (sqrtl(x * x + 1.0) + absx));
     }
-    else {
-        return sign * ( reference_logl( absx ) + ln2 );
+    else
+    {
+        return sign * (reference_logl(absx) + ln2);
     }
 }
 
-long double reference_atanhl( long double x )
+long double reference_atanhl(long double x)
 {
-/*
- * ====================================================
- * This function is from fdlibm: http://www.netlib.org
- *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-    if( isnan(x)  )
-        return x + x;
+    /*
+     * ====================================================
+     * This function is from fdlibm: http://www.netlib.org
+     *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+     *
+     * Developed at SunSoft, a Sun Microsystems, Inc. business.
+     * Permission to use, copy, modify, and distribute this
+     * software is freely granted, provided that this notice
+     * is preserved.
+     * ====================================================
+     */
+    if (isnan(x)) return x + x;
 
-    long double signed_half = reference_copysignl( 0.5L, x );
+    long double signed_half = reference_copysignl(0.5L, x);
     x = reference_fabsl(x);
-    if( x > 1.0L )
-        return cl_make_nan();
+    if (x > 1.0L) return cl_make_nan();
 
-    if( x < 0.5L )
-        return signed_half * reference_log1pl( 2.0L * ( x + x*x / (1-x) ) );
+    if (x < 0.5L)
+        return signed_half * reference_log1pl(2.0L * (x + x * x / (1 - x)));
 
-    return signed_half * reference_log1pl(2.0L * x / (1-x));
+    return signed_half * reference_log1pl(2.0L * x / (1 - x));
 }
 
-long double reference_exp2l(  long double z)
+long double reference_exp2l(long double z)
 {
     double_double x;
     int j;
 
     // Handle NaNs
-    if( isnan(z) )
-        return z;
+    if (isnan(z)) return z;
 
     // init x
     x.hi = z;
     x.lo = z - x.hi;
 
-    //Deal with overflow and underflow for exp2(x) stage next
-    if( x.hi >= 1025 )
-        return INFINITY;
+    // Deal with overflow and underflow for exp2(x) stage next
+    if (x.hi >= 1025) return INFINITY;
 
-    if( x.hi < -1075-24 )
-        return +0.0;
+    if (x.hi < -1075 - 24) return +0.0;
 
     // find nearest integer to x
-    int i = (int) rint(x.hi);
+    int i = (int)rint(x.hi);
 
     // x now holds fractional part.  The result would be then 2**i  * exp2( x )
     x.hi -= i;
 
-    // We could attempt to find a minimax polynomial for exp2(x) over the range x = [-0.5, 0.5].
-    // However, this would converge very slowly near the extrema, where 0.5**n is not a lot different
-    // from 0.5**(n+1), thereby requiring something like a 20th order polynomial to get 53 + 24 bits
-    // of precision. Instead we further reduce the range to [-1/32, 1/32] by observing that
+    // We could attempt to find a minimax polynomial for exp2(x) over the range
+    // x = [-0.5, 0.5]. However, this would converge very slowly near the
+    // extrema, where 0.5**n is not a lot different from 0.5**(n+1), thereby
+    // requiring something like a 20th order polynomial to get 53 + 24 bits of
+    // precision. Instead we further reduce the range to [-1/32, 1/32] by
+    // observing that
     //
     //  2**(a+b) = 2**a * 2**b
     //
-    // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and reduce the range
-    // of x to [-1/32, 1/32] by subtracting away the nearest value of n/16 from x.
-    const double_double corrections[17] =
-    {
-        { HEX_DBL( +, 1, 6a09e667f3bcd, -, 1 ), HEX_DBL( -, 1, bdd3413b26456, -, 55 ) },
-        { HEX_DBL( +, 1, 7a11473eb0187, -, 1 ), HEX_DBL( -, 1, 41577ee04992f, -, 56 ) },
-        { HEX_DBL( +, 1, 8ace5422aa0db, -, 1 ), HEX_DBL( +, 1, 6e9f156864b27, -, 55 ) },
-        { HEX_DBL( +, 1, 9c49182a3f09,  -, 1 ), HEX_DBL( +, 1, c7c46b071f2be, -, 57 ) },
-        { HEX_DBL( +, 1, ae89f995ad3ad, -, 1 ), HEX_DBL( +, 1, 7a1cd345dcc81, -, 55 ) },
-        { HEX_DBL( +, 1, c199bdd85529c, -, 1 ), HEX_DBL( +, 1, 11065895048dd, -, 56 ) },
-        { HEX_DBL( +, 1, d5818dcfba487, -, 1 ), HEX_DBL( +, 1, 2ed02d75b3707, -, 56 ) },
-        { HEX_DBL( +, 1, ea4afa2a490da, -, 1 ), HEX_DBL( -, 1, e9c23179c2893, -, 55 ) },
-        { HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,             +,  0 ) },
-        { HEX_DBL( +, 1, 0b5586cf9890f, +, 0 ), HEX_DBL( +, 1, 8a62e4adc610b, -, 54 ) },
-        { HEX_DBL( +, 1, 172b83c7d517b, +, 0 ), HEX_DBL( -, 1, 19041b9d78a76, -, 55 ) },
-        { HEX_DBL( +, 1, 2387a6e756238, +, 0 ), HEX_DBL( +, 1, 9b07eb6c70573, -, 54 ) },
-        { HEX_DBL( +, 1, 306fe0a31b715, +, 0 ), HEX_DBL( +, 1, 6f46ad23182e4, -, 55 ) },
-        { HEX_DBL( +, 1, 3dea64c123422, +, 0 ), HEX_DBL( +, 1, ada0911f09ebc, -, 55 ) },
-        { HEX_DBL( +, 1, 4bfdad5362a27, +, 0 ), HEX_DBL( +, 1, d4397afec42e2, -, 56 ) },
-        { HEX_DBL( +, 1, 5ab07dd485429, +, 0 ), HEX_DBL( +, 1, 6324c054647ad, -, 54 ) },
-        { HEX_DBL( +, 1, 6a09e667f3bcd, +, 0 ), HEX_DBL( -, 1, bdd3413b26456, -, 54 ) }
+    // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and
+    // reduce the range of x to [-1/32, 1/32] by subtracting away the nearest
+    // value of n/16 from x.
+    const double_double corrections[17] = {
+        { HEX_DBL(+, 1, 6a09e667f3bcd, -, 1),
+          HEX_DBL(-, 1, bdd3413b26456, -, 55) },
+        { HEX_DBL(+, 1, 7a11473eb0187, -, 1),
+          HEX_DBL(-, 1, 41577ee04992f, -, 56) },
+        { HEX_DBL(+, 1, 8ace5422aa0db, -, 1),
+          HEX_DBL(+, 1, 6e9f156864b27, -, 55) },
+        { HEX_DBL(+, 1, 9c49182a3f09, -, 1),
+          HEX_DBL(+, 1, c7c46b071f2be, -, 57) },
+        { HEX_DBL(+, 1, ae89f995ad3ad, -, 1),
+          HEX_DBL(+, 1, 7a1cd345dcc81, -, 55) },
+        { HEX_DBL(+, 1, c199bdd85529c, -, 1),
+          HEX_DBL(+, 1, 11065895048dd, -, 56) },
+        { HEX_DBL(+, 1, d5818dcfba487, -, 1),
+          HEX_DBL(+, 1, 2ed02d75b3707, -, 56) },
+        { HEX_DBL(+, 1, ea4afa2a490da, -, 1),
+          HEX_DBL(-, 1, e9c23179c2893, -, 55) },
+        { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+        { HEX_DBL(+, 1, 0b5586cf9890f, +, 0),
+          HEX_DBL(+, 1, 8a62e4adc610b, -, 54) },
+        { HEX_DBL(+, 1, 172b83c7d517b, +, 0),
+          HEX_DBL(-, 1, 19041b9d78a76, -, 55) },
+        { HEX_DBL(+, 1, 2387a6e756238, +, 0),
+          HEX_DBL(+, 1, 9b07eb6c70573, -, 54) },
+        { HEX_DBL(+, 1, 306fe0a31b715, +, 0),
+          HEX_DBL(+, 1, 6f46ad23182e4, -, 55) },
+        { HEX_DBL(+, 1, 3dea64c123422, +, 0),
+          HEX_DBL(+, 1, ada0911f09ebc, -, 55) },
+        { HEX_DBL(+, 1, 4bfdad5362a27, +, 0),
+          HEX_DBL(+, 1, d4397afec42e2, -, 56) },
+        { HEX_DBL(+, 1, 5ab07dd485429, +, 0),
+          HEX_DBL(+, 1, 6324c054647ad, -, 54) },
+        { HEX_DBL(+, 1, 6a09e667f3bcd, +, 0),
+          HEX_DBL(-, 1, bdd3413b26456, -, 54) }
     };
-    int index = (int) rint( x.hi * 16.0 );
-    x.hi -= (double) index * 0.0625;
+    int index = (int)rint(x.hi * 16.0);
+    x.hi -= (double)index * 0.0625;
 
     // canonicalize x
     double temp = x.hi;
     x.hi += x.lo;
     x.lo -= x.hi - temp;
 
-    // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32].  Max Error: 2 * 0x1.e112p-87
-    const double_double c[] = {
-        {HEX_DBL( +, 1, 62e42fefa39ef, -,  1 ), HEX_DBL( +, 1, abc9e3ac1d244, -, 56 )},
-        {HEX_DBL( +, 1, ebfbdff82c58f, -,  3 ), HEX_DBL( -, 1, 5e4987a631846, -, 57 )},
-        {HEX_DBL( +, 1, c6b08d704a0c,  -,  5 ), HEX_DBL( -, 1, d323200a05713, -, 59 )},
-        {HEX_DBL( +, 1, 3b2ab6fba4e7a, -,  7 ), HEX_DBL( +, 1, c5ee8f8b9f0c1, -, 63 )},
-        {HEX_DBL( +, 1, 5d87fe78a672a, -, 10 ), HEX_DBL( +, 1, 884e5e5cc7ecc, -, 64 )},
-        {HEX_DBL( +, 1, 430912f7e8373, -, 13 ), HEX_DBL( +, 1, 4f1b59514a326, -, 67 )},
-        {HEX_DBL( +, 1, ffcbfc5985e71, -, 17 ), HEX_DBL( -, 1, db7d6a0953b78, -, 71 )},
-        {HEX_DBL( +, 1, 62c150eb16465, -, 20 ), HEX_DBL( +, 1, e0767c2d7abf5, -, 80 )},
-        {HEX_DBL( +, 1, b52502b5e953,  -, 24 ), HEX_DBL( +, 1, 6797523f944bc, -, 78 )}
-    };
-    size_t count = sizeof( c ) / sizeof( c[0] );
+    // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32].  Max
+    // Error: 2 * 0x1.e112p-87
+    const double_double c[] = { { HEX_DBL(+, 1, 62e42fefa39ef, -, 1),
+                                  HEX_DBL(+, 1, abc9e3ac1d244, -, 56) },
+                                { HEX_DBL(+, 1, ebfbdff82c58f, -, 3),
+                                  HEX_DBL(-, 1, 5e4987a631846, -, 57) },
+                                { HEX_DBL(+, 1, c6b08d704a0c, -, 5),
+                                  HEX_DBL(-, 1, d323200a05713, -, 59) },
+                                { HEX_DBL(+, 1, 3b2ab6fba4e7a, -, 7),
+                                  HEX_DBL(+, 1, c5ee8f8b9f0c1, -, 63) },
+                                { HEX_DBL(+, 1, 5d87fe78a672a, -, 10),
+                                  HEX_DBL(+, 1, 884e5e5cc7ecc, -, 64) },
+                                { HEX_DBL(+, 1, 430912f7e8373, -, 13),
+                                  HEX_DBL(+, 1, 4f1b59514a326, -, 67) },
+                                { HEX_DBL(+, 1, ffcbfc5985e71, -, 17),
+                                  HEX_DBL(-, 1, db7d6a0953b78, -, 71) },
+                                { HEX_DBL(+, 1, 62c150eb16465, -, 20),
+                                  HEX_DBL(+, 1, e0767c2d7abf5, -, 80) },
+                                { HEX_DBL(+, 1, b52502b5e953, -, 24),
+                                  HEX_DBL(+, 1, 6797523f944bc, -, 78) } };
+    size_t count = sizeof(c) / sizeof(c[0]);
 
     // Do polynomial
-    double_double r = c[count-1];
-    for( j = (int) count-2; j >= 0; j-- )
-        r = add_dd( c[j], mul_dd( r, x ) );
+    double_double r = c[count - 1];
+    for (j = (int)count - 2; j >= 0; j--) r = add_dd(c[j], mul_dd(r, x));
 
     // unwind approximation
-    r = mul_dd( r, x );     // before: r =(exp2(x)-1)/x;   after: r = exp2(x) - 1
+    r = mul_dd(r, x); // before: r =(exp2(x)-1)/x;   after: r = exp2(x) - 1
 
     // correct for [-0.5, 0.5] -> [-1/32, 1/32] reduction above
     //  exp2(x) = (r + 1) * correction = r * correction + correction
-    r = mul_dd( r, corrections[index+8] );
-    r = add_dd( r, corrections[index+8] );
+    r = mul_dd(r, corrections[index + 8]);
+    r = add_dd(r, corrections[index + 8]);
 
-// Format result for output:
+    // Format result for output:
 
     // Get mantissa
-    long double m = ((long double) r.hi + (long double) r.lo );
+    long double m = ((long double)r.hi + (long double)r.lo);
 
     // Handle a pesky overflow cases when long double = double
-    if( i > 512 )
+    if (i > 512)
     {
-        m *= HEX_DBL( +, 1, 0, +, 512 );
+        m *= HEX_DBL(+, 1, 0, +, 512);
         i -= 512;
     }
-    else if( i < -512 )
+    else if (i < -512)
     {
-        m *= HEX_DBL( +, 1, 0, -, 512 );
+        m *= HEX_DBL(+, 1, 0, -, 512);
         i += 512;
     }
 
-    return m * ldexpl( 1.0L, i );
+    return m * ldexpl(1.0L, i);
 }
 
-long double reference_expm1l(  long double x)
+long double reference_expm1l(long double x)
 {
-#if defined( _MSC_VER ) && ! defined( __INTEL_COMPILER )
-    //unimplemented
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+    // unimplemented
     return x;
 #else
-    union { double f; cl_ulong u;} u;
-    u.f = (double) x;
+    union {
+        double f;
+        cl_ulong u;
+    } u;
+    u.f = (double)x;
 
-    if (reference_isnanl(x))
-        return x;
+    if (reference_isnanl(x)) return x;
 
-    if ( x > 710 )
-        return INFINITY;
+    if (x > 710) return INFINITY;
 
     long double y = expm1l(x);
 
     // Range of expm1l is -1.0L to +inf. Negative inf
     // on a few Linux platforms is clearly the wrong sign.
-    if (reference_isinfl(y))
-        y = INFINITY;
+    if (reference_isinfl(y)) y = INFINITY;
 
     return y;
 #endif
 }
 
-long double reference_fmaxl( long double x, long double y )
+long double reference_fmaxl(long double x, long double y)
 {
-    if( isnan(y) )
-        return x;
+    if (isnan(y)) return x;
 
     return x >= y ? x : y;
 }
 
-long double reference_fminl( long double x, long double y )
+long double reference_fminl(long double x, long double y)
 {
-    if( isnan(y) )
-        return x;
+    if (isnan(y)) return x;
 
     return x <= y ? x : y;
 }
 
-long double reference_hypotl( long double x, long double y )
+long double reference_hypotl(long double x, long double y)
 {
-  static const double tobig = HEX_DBL( +, 1, 0, +, 511 );
-  static const double big = HEX_DBL( +, 1, 0, +, 513 );
-  static const double rbig = HEX_DBL( +, 1, 0, -, 513 );
-  static const double tosmall = HEX_DBL( +, 1, 0, -, 511 );
-  static const double smalll = HEX_DBL( +, 1, 0, -, 607 );
-  static const double rsmall = HEX_DBL( +, 1, 0, +, 607 );
+    static const double tobig = HEX_DBL(+, 1, 0, +, 511);
+    static const double big = HEX_DBL(+, 1, 0, +, 513);
+    static const double rbig = HEX_DBL(+, 1, 0, -, 513);
+    static const double tosmall = HEX_DBL(+, 1, 0, -, 511);
+    static const double smalll = HEX_DBL(+, 1, 0, -, 607);
+    static const double rsmall = HEX_DBL(+, 1, 0, +, 607);
 
     long double max, min;
 
-    if( isinf(x) || isinf(y) )
-        return INFINITY;
+    if (isinf(x) || isinf(y)) return INFINITY;
 
-    if( isnan(x) || isnan(y) )
-        return x + y;
+    if (isnan(x) || isnan(y)) return x + y;
 
     x = reference_fabsl(x);
     y = reference_fabsl(y);
 
-    max = reference_fmaxl( x, y );
-    min = reference_fminl( x, y );
+    max = reference_fmaxl(x, y);
+    min = reference_fminl(x, y);
 
-  if( max > tobig )
+    if (max > tobig)
     {
         max *= rbig;
         min *= rbig;
-        return big * sqrtl( max * max + min * min );
+        return big * sqrtl(max * max + min * min);
     }
 
-  if( max < tosmall )
+    if (max < tosmall)
     {
         max *= rsmall;
         min *= rsmall;
-      return smalll * sqrtl( max * max + min * min );
+        return smalll * sqrtl(max * max + min * min);
     }
-    return sqrtl( x * x + y * y );
+    return sqrtl(x * x + y * y);
 }
 
-//long double reference_log2l( long double x )
+// long double reference_log2l( long double x )
 //{
 //    return log( x ) * 1.44269504088896340735992468100189214L;
 //}
 
-long double reference_log2l( long double x )
+long double reference_log2l(long double x)
 {
-    if( isnan(x) || x < 0.0 || x == -INFINITY)
-        return NAN;
+    if (isnan(x) || x < 0.0 || x == -INFINITY) return NAN;
 
-    if( x == 0.0f)
-        return -INFINITY;
+    if (x == 0.0f) return -INFINITY;
 
-    if( x == INFINITY )
-        return INFINITY;
+    if (x == INFINITY) return INFINITY;
 
     double hi, lo;
-    __log2_ep( &hi, &lo, x);
+    __log2_ep(&hi, &lo, x);
 
-    return (long double) hi + (long double) lo;
+    return (long double)hi + (long double)lo;
 }
 
-long double reference_log1pl(  long double x)
+long double reference_log1pl(long double x)
 {
-#if defined( _MSC_VER ) && ! defined( __INTEL_COMPILER )
-    //unimplemented
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+    // unimplemented
     return x;
 #elif defined(__PPC__)
     // log1pl on PPC inadvertantly returns NaN for very large values. Work
@@ -3434,23 +3878,24 @@ long double reference_log1pl(  long double x)
 #endif
 }
 
-long double reference_logbl( long double x )
+long double reference_logbl(long double x)
 {
     // Since we are just using this to verify double precision, we can
     // use the double precision copysign here
-    union { double f; cl_ulong u;} u;
-    u.f = (double) x;
+    union {
+        double f;
+        cl_ulong u;
+    } u;
+    u.f = (double)x;
 
     cl_int exponent = (cl_uint)(u.u >> 52) & 0x7ff;
-    if( exponent == 0x7ff )
-        return x * x;
+    if (exponent == 0x7ff) return x * x;
 
-    if( exponent == 0 )
-    {   // deal with denormals
-        u.f =  x * HEX_DBL( +, 1, 0, +, 64 );
+    if (exponent == 0)
+    { // deal with denormals
+        u.f = x * HEX_DBL(+, 1, 0, +, 64);
         exponent = (cl_int)(u.u >> 52) & 0x7ff;
-        if( exponent == 0 )
-            return -INFINITY;
+        if (exponent == 0) return -INFINITY;
 
         return exponent - (1023 + 64);
     }
@@ -3458,84 +3903,84 @@ long double reference_logbl( long double x )
     return exponent - 1023;
 }
 
-long double reference_maxmagl( long double x, long double y )
+long double reference_maxmagl(long double x, long double y)
 {
     long double fabsx = fabsl(x);
     long double fabsy = fabsl(y);
 
-    if( fabsx < fabsy )
-        return y;
+    if (fabsx < fabsy) return y;
 
-    if( fabsy < fabsx )
-        return x;
+    if (fabsy < fabsx) return x;
 
     return reference_fmaxl(x, y);
 }
 
-long double reference_minmagl( long double x, long double y )
+long double reference_minmagl(long double x, long double y)
 {
     long double fabsx = fabsl(x);
     long double fabsy = fabsl(y);
 
-    if( fabsx > fabsy )
-        return y;
+    if (fabsx > fabsy) return y;
 
-    if( fabsy > fabsx )
-        return x;
+    if (fabsy > fabsx) return x;
 
     return reference_fminl(x, y);
 }
 
-long double reference_nanl( cl_ulong x )
+long double reference_nanl(cl_ulong x)
 {
-    union{ cl_ulong u; cl_double f; }u;
+    union {
+        cl_ulong u;
+        cl_double f;
+    } u;
     u.u = x | 0x7ff8000000000000ULL;
-    return (long double) u.f;
+    return (long double)u.f;
 }
 
 
-long double reference_reciprocall( long double x )
-{
-    return 1.0L / x;
-}
+long double reference_reciprocall(long double x) { return 1.0L / x; }
 
-long double reference_remainderl( long double x, long double y );
-long double reference_remainderl( long double x, long double y )
+long double reference_remainderl(long double x, long double y);
+long double reference_remainderl(long double x, long double y)
 {
     int i;
-    return reference_remquol( x, y, &i );
+    return reference_remquol(x, y, &i);
 }
 
-long double reference_lgammal( long double x);
-long double reference_lgammal( long double x)
+long double reference_lgammal(long double x);
+long double reference_lgammal(long double x)
 {
     // lgamma is currently not tested
-    return reference_lgamma( x );
-}
-
-static uint32_t two_over_pi[] = { 0x0, 0x28be60db, 0x24e44152, 0x27f09d5f, 0x11f534dd, 0x3036d8a5, 0x1993c439, 0x107f945, 0x23abdebb, 0x31586dc9,
-0x6e3a424, 0x374b8019, 0x92eea09, 0x3464873f, 0x21deb1cb, 0x4a69cfb, 0x288235f5, 0xbaed121, 0xe99c702, 0x1ad17df9,
-0x13991d6, 0xe60d4ce, 0x1f49c845, 0x3e2ef7e4, 0x283b1ff8, 0x25fff781, 0x1980fef2, 0x3c462d68, 0xa6d1f6d, 0xd9fb3c9,
-0x3cb09b74, 0x3d18fd9a, 0x1e5fea2d, 0x1d49eeb1, 0x3ebe5f17, 0x2cf41ce7, 0x378a5292, 0x3a9afed7, 0x3b11f8d5, 0x3421580c,
-0x3046fc7b, 0x1aeafc33, 0x3bc209af, 0x10d876a7, 0x2391615e, 0x3986c219, 0x199855f1, 0x1281a102, 0xdffd880, 0x135cc9cc,
-0x10606155
+    return reference_lgamma(x);
+}
+
+static uint32_t two_over_pi[] = {
+    0x0,        0x28be60db, 0x24e44152, 0x27f09d5f, 0x11f534dd, 0x3036d8a5,
+    0x1993c439, 0x107f945,  0x23abdebb, 0x31586dc9, 0x6e3a424,  0x374b8019,
+    0x92eea09,  0x3464873f, 0x21deb1cb, 0x4a69cfb,  0x288235f5, 0xbaed121,
+    0xe99c702,  0x1ad17df9, 0x13991d6,  0xe60d4ce,  0x1f49c845, 0x3e2ef7e4,
+    0x283b1ff8, 0x25fff781, 0x1980fef2, 0x3c462d68, 0xa6d1f6d,  0xd9fb3c9,
+    0x3cb09b74, 0x3d18fd9a, 0x1e5fea2d, 0x1d49eeb1, 0x3ebe5f17, 0x2cf41ce7,
+    0x378a5292, 0x3a9afed7, 0x3b11f8d5, 0x3421580c, 0x3046fc7b, 0x1aeafc33,
+    0x3bc209af, 0x10d876a7, 0x2391615e, 0x3986c219, 0x199855f1, 0x1281a102,
+    0xdffd880,  0x135cc9cc, 0x10606155
 };
 
-static uint32_t pi_over_two[] = { 0x1, 0x2487ed51, 0x42d1846, 0x26263314, 0x1701b839, 0x28948127 };
+static uint32_t pi_over_two[] = { 0x1,        0x2487ed51, 0x42d1846,
+                                  0x26263314, 0x1701b839, 0x28948127 };
 
-typedef union
-    {
-        uint64_t u;
-        double   d;
-    }d_ui64_t;
+typedef union {
+    uint64_t u;
+    double d;
+} d_ui64_t;
 
 // radix or base of representation
 #define RADIX (30)
 #define DIGITS 6
 
-d_ui64_t two_pow_pradix = { (uint64_t) (1023 + RADIX) << 52 };
-d_ui64_t two_pow_mradix = { (uint64_t) (1023 - RADIX) << 52 };
-d_ui64_t two_pow_two_mradix = { (uint64_t) (1023-2*RADIX) << 52 };
+d_ui64_t two_pow_pradix = { (uint64_t)(1023 + RADIX) << 52 };
+d_ui64_t two_pow_mradix = { (uint64_t)(1023 - RADIX) << 52 };
+d_ui64_t two_pow_two_mradix = { (uint64_t)(1023 - 2 * RADIX) << 52 };
 
 #define tp_pradix two_pow_pradix.d
 #define tp_mradix two_pow_mradix.d
@@ -3544,11 +3989,12 @@ d_ui64_t two_pow_two_mradix = { (uint64_t) (1023-2*RADIX) << 52 };
 // floating point number.
 // x = sign * [ sum_{i = 0 to 2} ( X[i] * 2^(index - i)*RADIX ) ]
 typedef struct
-    {
-        uint32_t X[3];        // three 32 bit integers are sufficient to represnt double in base_30
-        int index;            // exponent bias
-        int sign;            // sign of double
-    }eprep_t;
+{
+    uint32_t X[3]; // three 32 bit integers are sufficient to represent double
+                   // in base_30
+    int index; // exponent bias
+    int sign; // sign of double
+} eprep_t;
 
 static eprep_t double_to_eprep(double x);
 
@@ -3556,15 +4002,17 @@ static eprep_t double_to_eprep(double x)
 {
     eprep_t result;
 
-    result.sign = (signbit( x ) == 0) ? 1 : -1;
-    x = fabs( x );
+    result.sign = (signbit(x) == 0) ? 1 : -1;
+    x = fabs(x);
 
     int index = 0;
-    while( x > tp_pradix ) {
+    while (x > tp_pradix)
+    {
         index++;
         x *= tp_mradix;
     }
-    while( x < 1 ) {
+    while (x < 1)
+    {
         index--;
         x *= tp_pradix;
     }
@@ -3572,9 +4020,10 @@ static eprep_t double_to_eprep(double x)
     result.index = index;
     int i = 0;
     result.X[0] = result.X[1] = result.X[2] = 0;
-    while( x != 0.0 ) {
-        result.X[i] = (uint32_t) x;
-        x = (x - (double) result.X[i]) * tp_pradix;
+    while (x != 0.0)
+    {
+        result.X[i] = (uint32_t)x;
+        x = (x - (double)result.X[i]) * tp_pradix;
         i++;
     }
     return result;
@@ -3660,102 +4109,120 @@ static eprep_t double_to_eprep(double x)
  return sgn*res;
  }
  */
-static double eprep_to_double( eprep_t epx );
+static double eprep_to_double(eprep_t epx);
 
-static double eprep_to_double( eprep_t epx )
+static double eprep_to_double(eprep_t epx)
 {
     double res = 0.0;
 
-    res += ldexp((double) epx.X[0], (epx.index - 0)*RADIX);
-    res += ldexp((double) epx.X[1], (epx.index - 1)*RADIX);
-    res += ldexp((double) epx.X[2], (epx.index - 2)*RADIX);
+    res += ldexp((double)epx.X[0], (epx.index - 0) * RADIX);
+    res += ldexp((double)epx.X[1], (epx.index - 1) * RADIX);
+    res += ldexp((double)epx.X[2], (epx.index - 2) * RADIX);
 
     return copysign(res, epx.sign);
 }
 
-static int payne_hanek( double *y, int *exception );
+static int payne_hanek(double *y, int *exception);
 
-static int payne_hanek( double *y, int *exception )
+static int payne_hanek(double *y, int *exception)
 {
     double x = *y;
 
     // exception cases .. no reduction required
-    if( isnan( x ) || isinf( x ) || (fabs( x ) <= M_PI_4) ) {
+    if (isnan(x) || isinf(x) || (fabs(x) <= M_PI_4))
+    {
         *exception = 1;
         return 0;
     }
 
     *exception = 0;
 
-    // After computation result[0] contains integer part while result[1]....result[DIGITS-1]
-    // contain fractional part. So we are doing computation with (DIGITS-1)*RADIX precision.
-    // Default DIGITS=6 and RADIX=30 so default precision is 150 bits. Kahan-McDonald algorithm
-    // shows that a double precision x, closest to pi/2 is 6381956970095103 x 2^797 which can
-    // cause 61 digits of cancellation in computation of f = x*2/pi - floor(x*2/pi) ... thus we need
-    // at least 114 bits (61 leading zeros + 53 bits of mentissa of f) of precision to accurately compute
-    // f in double precision. Since we are using 150 bits (still an overkill), we should be safe. Extra
-    // bits can act as guard bits for correct rounding.
-    uint64_t result[DIGITS+2];
+    // After computation result[0] contains integer part while
+    // result[1]....result[DIGITS-1] contain fractional part. So we are doing
+    // computation with (DIGITS-1)*RADIX precision. Default DIGITS=6 and
+    // RADIX=30 so default precision is 150 bits. Kahan-McDonald algorithm shows
+    // that a double precision x, closest to pi/2 is 6381956970095103 x 2^797
+    // which can cause 61 digits of cancellation in computation of f = x*2/pi -
+    // floor(x*2/pi) ... thus we need at least 114 bits (61 leading zeros + 53
+    // bits of mantissa of f) of precision to accurately compute f in double
+    // precision. Since we are using 150 bits (still an overkill), we should be
+    // safe. Extra bits can act as guard bits for correct rounding.
+    uint64_t result[DIGITS + 2];
 
     // compute extended precision representation of x
-    eprep_t epx = double_to_eprep( x );
+    eprep_t epx = double_to_eprep(x);
     int index = epx.index;
     int i, j;
-    // extended precision multiplication of 2/pi*x .... we will loose at max two RADIX=30 bit digits in
-    // the worst case
-    for(i = 0; i < (DIGITS+2); i++) {
+    // extended precision multiplication of 2/pi*x .... we will lose at most two
+    // RADIX=30 bit digits in the worst case
+    for (i = 0; i < (DIGITS + 2); i++)
+    {
         result[i] = 0;
-        result[i] += ((index + i - 0) >= 0) ? ((uint64_t) two_over_pi[index + i - 0] * (uint64_t) epx.X[0]) : 0;
-        result[i] += ((index + i - 1) >= 0) ? ((uint64_t) two_over_pi[index + i - 1] * (uint64_t) epx.X[1]) : 0;
-        result[i] += ((index + i - 2) >= 0) ? ((uint64_t) two_over_pi[index + i - 2] * (uint64_t) epx.X[2]) : 0;
+        result[i] += ((index + i - 0) >= 0)
+            ? ((uint64_t)two_over_pi[index + i - 0] * (uint64_t)epx.X[0])
+            : 0;
+        result[i] += ((index + i - 1) >= 0)
+            ? ((uint64_t)two_over_pi[index + i - 1] * (uint64_t)epx.X[1])
+            : 0;
+        result[i] += ((index + i - 2) >= 0)
+            ? ((uint64_t)two_over_pi[index + i - 2] * (uint64_t)epx.X[2])
+            : 0;
     }
 
     // Carry propagation.
     uint64_t tmp;
-    for(i = DIGITS+2-1; i > 0; i--) {
+    for (i = DIGITS + 2 - 1; i > 0; i--)
+    {
         tmp = result[i] >> RADIX;
         result[i - 1] += tmp;
         result[i] -= (tmp << RADIX);
     }
 
-    // we dont ned to normalize the integer part since only last two bits of this will be used
-    // subsequently algorithm which remain unaltered by this normalization.
-    // tmp = result[0] >> RADIX;
-    // result[0] -= (tmp << RADIX);
-    unsigned int N = (unsigned int) result[0];
+    // we don't need to normalize the integer part since only its last two
+    // bits are used below, and they are unaltered by this normalization:
+    //     tmp = result[0] >> RADIX; result[0] -= (tmp << RADIX);
+    unsigned int N = (unsigned int)result[0];
 
-    // if the result is > pi/4, bring it to (-pi/4, pi/4] range. Note that testing if the final
-    // x_star = pi/2*(x*2/pi - k) > pi/4 is equivalent to testing, at this stage, if r[1] (the first fractional
-    // digit) is greater than (2^RADIX)/2 and substracting pi/4 from x_star to bring it to mentioned
-    // range is equivalent to substracting fractional part at this stage from one and changing the sign.
+    // if the result is > pi/4, bring it to (-pi/4, pi/4] range. Note that
+    // testing if the final x_star = pi/2*(x*2/pi - k) > pi/4 is equivalent to
+    // testing, at this stage, if result[1] (the first fractional digit) is
+    // greater than (2^RADIX)/2, and subtracting pi/2 from x_star to bring it
+    // to the mentioned range is equivalent to subtracting the fractional part
+    // at this stage from one and changing the sign.
     int sign = 1;
-    if(result[1] > (uint64_t)(1 << (RADIX - 1))) {
-        for(i = 1; i < (DIGITS + 2); i++)
+    if (result[1] > (uint64_t)(1 << (RADIX - 1)))
+    {
+        for (i = 1; i < (DIGITS + 2); i++)
             result[i] = (~((unsigned int)result[i]) & 0x3fffffff);
         N += 1;
         sign = -1;
     }
 
-    // Again as per Kahan-McDonald algorithim there may be 61 leading zeros in the worst case
-    // (when x is multiple of 2/pi very close to an integer) so we need to get rid of these zeros
-    // and adjust the index of final result. So in the worst case, precision of comupted result is
-    // 90 bits (150 bits original bits - 60 lost in cancellation).
+    // Again as per Kahan-McDonald algorithm there may be 61 leading zeros in
+    // the worst case (when x is multiple of 2/pi very close to an integer) so
+    // we need to get rid of these zeros and adjust the index of final result.
+    // So in the worst case, precision of computed result is 90 bits (150 bits
+    // original bits - 60 lost in cancellation).
     int ind = 1;
-    for(i = 1; i < (DIGITS+2); i++) {
-        if(result[i] != 0)
+    for (i = 1; i < (DIGITS + 2); i++)
+    {
+        if (result[i] != 0)
             break;
         else
             ind++;
     }
 
-    uint64_t r[DIGITS-1];
-    for(i = 0; i < (DIGITS-1); i++) {
+    uint64_t r[DIGITS - 1];
+    for (i = 0; i < (DIGITS - 1); i++)
+    {
         r[i] = 0;
-        for(j = 0; j <= i; j++) {
-            r[i] += (result[ind+i-j] * (uint64_t) pi_over_two[j]);
+        for (j = 0; j <= i; j++)
+        {
+            r[i] += (result[ind + i - j] * (uint64_t)pi_over_two[j]);
         }
     }
-    for(i = (DIGITS-2); i > 0; i--) {
+    for (i = (DIGITS - 2); i > 0; i--)
+    {
         tmp = r[i] >> RADIX;
         r[i - 1] += tmp;
         r[i] -= (tmp << RADIX);
@@ -3764,147 +4231,127 @@ static int payne_hanek( double *y, int *exception )
     r[0] -= (tmp << RADIX);
 
     eprep_t epr;
-    epr.sign = epx.sign*sign;
-    if(tmp != 0) {
+    epr.sign = epx.sign * sign;
+    if (tmp != 0)
+    {
         epr.index = -ind + 1;
-        epr.X[0] = (uint32_t) tmp;
-        epr.X[1] = (uint32_t) r[0];
-        epr.X[2] = (uint32_t) r[1];
+        epr.X[0] = (uint32_t)tmp;
+        epr.X[1] = (uint32_t)r[0];
+        epr.X[2] = (uint32_t)r[1];
     }
-    else {
+    else
+    {
         epr.index = -ind;
-        epr.X[0] = (uint32_t) r[0];
-        epr.X[1] = (uint32_t) r[1];
-        epr.X[2] = (uint32_t) r[2];
+        epr.X[0] = (uint32_t)r[0];
+        epr.X[1] = (uint32_t)r[1];
+        epr.X[2] = (uint32_t)r[2];
     }
 
-    *y = eprep_to_double( epr );
-    return epx.sign*N;
+    *y = eprep_to_double(epr);
+    return epx.sign * N;
 }
 
 double reference_relaxed_cos(double x)
 {
-  if(isnan(x))
-    return NAN;
-  return (float)cos((float)x);
+    if (isnan(x)) return NAN;
+    return (float)cos((float)x);
 }
 
 double reference_cos(double x)
 {
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception )
-        return cos( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception) return cos(x);
     unsigned int c = N & 3;
-    switch ( c ) {
-        case 0:
-            return  cos( x );
-        case 1:
-            return -sin( x );
-        case 2:
-            return -cos( x );
-        case 3:
-            return  sin( x );
+    switch (c)
+    {
+        case 0: return cos(x);
+        case 1: return -sin(x);
+        case 2: return -cos(x);
+        case 3: return sin(x);
     }
     return 0.0;
 }
 
-double reference_relaxed_sin(double x){
-  return (float)sin((float)x);
-}
+double reference_relaxed_sin(double x) { return (float)sin((float)x); }
 
 double reference_sin(double x)
 {
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception )
-        return sin( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception) return sin(x);
     int c = N & 3;
-    switch ( c ) {
-        case 0:
-            return  sin( x );
-        case 1:
-            return  cos( x );
-        case 2:
-            return -sin( x );
-        case 3:
-            return -cos( x );
+    switch (c)
+    {
+        case 0: return sin(x);
+        case 1: return cos(x);
+        case 2: return -sin(x);
+        case 3: return -cos(x);
     }
     return 0.0;
 }
 
-double reference_relaxed_sincos(double x, double * y){
-  *y = reference_relaxed_cos(x);
-  return reference_relaxed_sin(x);
+double reference_relaxed_sincos(double x, double *y)
+{
+    *y = reference_relaxed_cos(x);
+    return reference_relaxed_sin(x);
 }
 
 double reference_sincos(double x, double *y)
 {
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception ) {
-        *y = cos( x );
-        return sin( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception)
+    {
+        *y = cos(x);
+        return sin(x);
     }
     int c = N & 3;
-    switch ( c ) {
-        case 0:
-            *y = cos( x );
-            return  sin( x );
-        case 1:
-            *y = -sin( x );
-            return  cos( x );
-        case 2:
-            *y = -cos( x );
-            return -sin( x );
-        case 3:
-            *y = sin( x );
-            return -cos( x );
+    switch (c)
+    {
+        case 0: *y = cos(x); return sin(x);
+        case 1: *y = -sin(x); return cos(x);
+        case 2: *y = -cos(x); return -sin(x);
+        case 3: *y = sin(x); return -cos(x);
     }
     return 0.0;
 }
 
-double reference_relaxed_tan(double x){
-  return ((float) reference_relaxed_sin((float)x))/((float) reference_relaxed_cos((float)x));
+double reference_relaxed_tan(double x)
+{
+    return ((float)reference_relaxed_sin((float)x))
+        / ((float)reference_relaxed_cos((float)x));
 }
 
 double reference_tan(double x)
 {
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception )
-        return tan( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception) return tan(x);
     int c = N & 3;
-    switch ( c ) {
-        case 0:
-            return  tan( x );
-        case 1:
-            return -1.0 / tan( x );
-        case 2:
-            return tan( x );
-        case 3:
-            return -1.0 / tan( x );
+    switch (c)
+    {
+        case 0: return tan(x);
+        case 1: return -1.0 / tan(x);
+        case 2: return tan(x);
+        case 3: return -1.0 / tan(x);
     }
     return 0.0;
 }
 
 long double reference_cosl(long double xx)
 {
-    double x = (double) xx;
+    double x = (double)xx;
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception )
-        return cosl( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception) return cosl(x);
     unsigned int c = N & 3;
-    switch ( c ) {
-        case 0:
-            return  cosl( x );
-        case 1:
-            return -sinl( x );
-        case 2:
-            return -cosl( x );
-        case 3:
-            return  sinl( x );
+    switch (c)
+    {
+        case 0: return cosl(x);
+        case 1: return -sinl(x);
+        case 2: return -cosl(x);
+        case 3: return sinl(x);
     }
     return 0.0;
 }
@@ -3913,25 +4360,20 @@ long double reference_sinl(long double xx)
 {
     // we use system tanl after reduction which
     // can flush denorm input to zero so
-    //take care of it here.
-    if(reference_fabsl(xx) < HEX_DBL( +, 1, 0, -, 1022 ))
-        return xx;
+    // take care of it here.
+    if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx;
 
-    double x = (double) xx;
+    double x = (double)xx;
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception )
-        return sinl( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception) return sinl(x);
     int c = N & 3;
-    switch ( c ) {
-        case 0:
-            return  sinl( x );
-        case 1:
-            return  cosl( x );
-        case 2:
-            return -sinl( x );
-        case 3:
-            return -cosl( x );
+    switch (c)
+    {
+        case 0: return sinl(x);
+        case 1: return cosl(x);
+        case 2: return -sinl(x);
+        case 3: return -cosl(x);
     }
     return 0.0;
 }
@@ -3940,34 +4382,28 @@ long double reference_sincosl(long double xx, long double *y)
 {
     // we use system tanl after reduction which
     // can flush denorm input to zero so
-    //take care of it here.
-    if(reference_fabsl(xx) < HEX_DBL( +, 1, 0, -, 1022 ))
+    // take care of it here.
+    if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022))
     {
         *y = cosl(xx);
         return xx;
     }
 
-    double x = (double) xx;
+    double x = (double)xx;
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception ) {
-        *y = cosl( x );
-        return sinl( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception)
+    {
+        *y = cosl(x);
+        return sinl(x);
     }
     int c = N & 3;
-    switch ( c ) {
-        case 0:
-            *y = cosl( x );
-            return  sinl( x );
-        case 1:
-            *y = -sinl( x );
-            return  cosl( x );
-        case 2:
-            *y = -cosl( x );
-            return -sinl( x );
-        case 3:
-            *y = sinl( x );
-            return -cosl( x );
+    switch (c)
+    {
+        case 0: *y = cosl(x); return sinl(x);
+        case 1: *y = -sinl(x); return cosl(x);
+        case 2: *y = -cosl(x); return -sinl(x);
+        case 3: *y = sinl(x); return -cosl(x);
     }
     return 0.0;
 }
@@ -3976,205 +4412,337 @@ long double reference_tanl(long double xx)
 {
     // we use system tanl after reduction which
     // can flush denorm input to zero so
-    //take care of it here.
-    if(reference_fabsl(xx) < HEX_DBL( +, 1, 0, -, 1022 ))
-        return xx;
+    // take care of it here.
+    if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx;
 
-    double x = (double) xx;
+    double x = (double)xx;
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception )
-        return tanl( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception) return tanl(x);
     int c = N & 3;
-    switch ( c ) {
-        case 0:
-            return  tanl( x );
-        case 1:
-            return -1.0 / tanl( x );
-        case 2:
-            return tanl( x );
-        case 3:
-            return -1.0 / tanl( x );
+    switch (c)
+    {
+        case 0: return tanl(x);
+        case 1: return -1.0 / tanl(x);
+        case 2: return tanl(x);
+        case 3: return -1.0 / tanl(x);
     }
     return 0.0;
 }
 
 static double __loglTable1[64][3] = {
-{HEX_DBL( +, 1, 5390948f40fea, +, 0 ), HEX_DBL( -, 1, a152f142a,  -, 2 ), HEX_DBL( +, 1, f93e27b43bd2c, -, 40 )},
-{HEX_DBL( +, 1, 5015015015015, +, 0 ), HEX_DBL( -, 1, 921800925,  -, 2 ), HEX_DBL( +, 1, 162432a1b8df7, -, 41 )},
-{HEX_DBL( +, 1, 4cab88725af6e, +, 0 ), HEX_DBL( -, 1, 8304d90c18, -, 2 ), HEX_DBL( +, 1, 80bb749056fe7, -, 40 )},
-{HEX_DBL( +, 1, 49539e3b2d066, +, 0 ), HEX_DBL( -, 1, 7418acebc,  -, 2 ), HEX_DBL( +, 1, ceac7f0607711, -, 43 )},
-{HEX_DBL( +, 1, 460cbc7f5cf9a, +, 0 ), HEX_DBL( -, 1, 6552b49988, -, 2 ), HEX_DBL( +, 1, d8913d0e89fa,  -, 42 )},
-{HEX_DBL( +, 1, 42d6625d51f86, +, 0 ), HEX_DBL( -, 1, 56b22e6b58, -, 2 ), HEX_DBL( +, 1, c7eaf515033a1, -, 44 )},
-{HEX_DBL( +, 1, 3fb013fb013fb, +, 0 ), HEX_DBL( -, 1, 48365e696,  -, 2 ), HEX_DBL( +, 1, 434adcde7edc7, -, 41 )},
-{HEX_DBL( +, 1, 3c995a47babe7, +, 0 ), HEX_DBL( -, 1, 39de8e156,  -, 2 ), HEX_DBL( +, 1, 8246f8e527754, -, 40 )},
-{HEX_DBL( +, 1, 3991c2c187f63, +, 0 ), HEX_DBL( -, 1, 2baa0c34c,  -, 2 ), HEX_DBL( +, 1, e1513c28e180d, -, 42 )},
-{HEX_DBL( +, 1, 3698df3de0747, +, 0 ), HEX_DBL( -, 1, 1d982c9d58, -, 2 ), HEX_DBL( +, 1, 63ea3fed4b8a2, -, 40 )},
-{HEX_DBL( +, 1, 33ae45b57bcb1, +, 0 ), HEX_DBL( -, 1, 0fa848045,  -, 2 ), HEX_DBL( +, 1, 32ccbacf1779b, -, 40 )},
-{HEX_DBL( +, 1, 30d190130d19,  +, 0 ), HEX_DBL( -, 1, 01d9bbcfa8, -, 2 ), HEX_DBL( +, 1, e2bfeb2b884aa, -, 42 )},
-{HEX_DBL( +, 1, 2e025c04b8097, +, 0 ), HEX_DBL( -, 1, e857d3d37,  -, 3 ), HEX_DBL( +, 1, d9309b4d2ea85, -, 40 )},
-{HEX_DBL( +, 1, 2b404ad012b4,  +, 0 ), HEX_DBL( -, 1, cd3c712d4,  -, 3 ), HEX_DBL( +, 1, ddf360962d7ab, -, 40 )},
-{HEX_DBL( +, 1, 288b01288b012, +, 0 ), HEX_DBL( -, 1, b2602497e,  -, 3 ), HEX_DBL( +, 1, 597f8a121640f, -, 40 )},
-{HEX_DBL( +, 1, 25e22708092f1, +, 0 ), HEX_DBL( -, 1, 97c1cb13d,  -, 3 ), HEX_DBL( +, 1, 02807d15580dc, -, 40 )},
-{HEX_DBL( +, 1, 23456789abcdf, +, 0 ), HEX_DBL( -, 1, 7d60496d,   -, 3 ), HEX_DBL( +, 1, 12ce913d7a827, -, 41 )},
-{HEX_DBL( +, 1, 20b470c67c0d8, +, 0 ), HEX_DBL( -, 1, 633a8bf44,  -, 3 ), HEX_DBL( +, 1, 0648bca9c96bd, -, 40 )},
-{HEX_DBL( +, 1, 1e2ef3b3fb874, +, 0 ), HEX_DBL( -, 1, 494f863b9,  -, 3 ), HEX_DBL( +, 1, 066fceb89b0eb, -, 42 )},
-{HEX_DBL( +, 1, 1bb4a4046ed29, +, 0 ), HEX_DBL( -, 1, 2f9e32d5c,  -, 3 ), HEX_DBL( +, 1, 17b8b6c4f846b, -, 46 )},
-{HEX_DBL( +, 1, 19453808ca29c, +, 0 ), HEX_DBL( -, 1, 162593187,  -, 3 ), HEX_DBL( +, 1, 2c83506452154, -, 42 )},
-{HEX_DBL( +, 1, 16e0689427378, +, 0 ), HEX_DBL( -, 1, f9c95dc1e,  -, 4 ), HEX_DBL( +, 1, dd5d2183150f3, -, 41 )},
-{HEX_DBL( +, 1, 1485f0e0acd3b, +, 0 ), HEX_DBL( -, 1, c7b528b72,  -, 4 ), HEX_DBL( +, 1, 0e43c4f4e619d, -, 40 )},
-{HEX_DBL( +, 1, 12358e75d3033, +, 0 ), HEX_DBL( -, 1, 960caf9ac,  -, 4 ), HEX_DBL( +, 1, 20fbfd5902a1e, -, 42 )},
-{HEX_DBL( +, 1, 0fef010fef01,  +, 0 ), HEX_DBL( -, 1, 64ce26c08,  -, 4 ), HEX_DBL( +, 1, 8ebeefb4ac467, -, 40 )},
-{HEX_DBL( +, 1, 0db20a88f4695, +, 0 ), HEX_DBL( -, 1, 33f7cde16,  -, 4 ), HEX_DBL( +, 1, 30b3312da7a7d, -, 40 )},
-{HEX_DBL( +, 1, 0b7e6ec259dc7, +, 0 ), HEX_DBL( -, 1, 0387efbcc,  -, 4 ), HEX_DBL( +, 1, 796f1632949c3, -, 40 )},
-{HEX_DBL( +, 1, 0953f39010953, +, 0 ), HEX_DBL( -, 1, a6f9c378,   -, 5 ), HEX_DBL( +, 1, 1687e151172cc, -, 40 )},
-{HEX_DBL( +, 1, 073260a47f7c6, +, 0 ), HEX_DBL( -, 1, 47aa07358,  -, 5 ), HEX_DBL( +, 1, 1f87e4a9cc778, -, 42 )},
-{HEX_DBL( +, 1, 05197f7d73404, +, 0 ), HEX_DBL( -, 1, d23afc498,  -, 6 ), HEX_DBL( +, 1, b183a6b628487, -, 40 )},
-{HEX_DBL( +, 1, 03091b51f5e1a, +, 0 ), HEX_DBL( -, 1, 16a21e21,   -, 6 ), HEX_DBL( +, 1, 7d75c58973ce5, -, 40 )},
-{HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,          +, 0 ), HEX_DBL( +, 0, 0,             +,  0 )},
-{HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,          +, 0 ), HEX_DBL( +, 0, 0,             +,  0 )},
-{HEX_DBL( +, 1, f44659e4a4271, -, 1 ), HEX_DBL( +, 1, 11cd1d51,   -, 5 ), HEX_DBL( +, 1, 9a0d857e2f4b2, -, 40 )},
-{HEX_DBL( +, 1, ecc07b301ecc,  -, 1 ), HEX_DBL( +, 1, c4dfab908,  -, 5 ), HEX_DBL( +, 1, 55b53fce557fd, -, 40 )},
-{HEX_DBL( +, 1, e573ac901e573, -, 1 ), HEX_DBL( +, 1, 3aa2fdd26,  -, 4 ), HEX_DBL( +, 1, f1cb0c9532089, -, 40 )},
-{HEX_DBL( +, 1, de5d6e3f8868a, -, 1 ), HEX_DBL( +, 1, 918a16e46,  -, 4 ), HEX_DBL( +, 1, 9af0dcd65a6e1, -, 43 )},
-{HEX_DBL( +, 1, d77b654b82c33, -, 1 ), HEX_DBL( +, 1, e72ec117e,  -, 4 ), HEX_DBL( +, 1, a5b93c4ebe124, -, 40 )},
-{HEX_DBL( +, 1, d0cb58f6ec074, -, 1 ), HEX_DBL( +, 1, 1dcd19755,  -, 3 ), HEX_DBL( +, 1, 5be50e71ddc6c, -, 42 )},
-{HEX_DBL( +, 1, ca4b3055ee191, -, 1 ), HEX_DBL( +, 1, 476a9f983,  -, 3 ), HEX_DBL( +, 1, ee9a798719e7f, -, 40 )},
-{HEX_DBL( +, 1, c3f8f01c3f8f,  -, 1 ), HEX_DBL( +, 1, 70742d4ef,  -, 3 ), HEX_DBL( +, 1, 3ff1352c1219c, -, 46 )},
-{HEX_DBL( +, 1, bdd2b899406f7, -, 1 ), HEX_DBL( +, 1, 98edd077e,  -, 3 ), HEX_DBL( +, 1, c383cd11362f4, -, 41 )},
-{HEX_DBL( +, 1, b7d6c3dda338b, -, 1 ), HEX_DBL( +, 1, c0db6cdd9,  -, 3 ), HEX_DBL( +, 1, 37bd85b1a824e, -, 41 )},
-{HEX_DBL( +, 1, b2036406c80d9, -, 1 ), HEX_DBL( +, 1, e840be74e,  -, 3 ), HEX_DBL( +, 1, a9334d525e1ec, -, 41 )},
-{HEX_DBL( +, 1, ac5701ac5701a, -, 1 ), HEX_DBL( +, 1, 0790adbb,   -, 2 ), HEX_DBL( +, 1, 8060bfb6a491,  -, 41 )},
-{HEX_DBL( +, 1, a6d01a6d01a6d, -, 1 ), HEX_DBL( +, 1, 1ac05b2918, -, 2 ), HEX_DBL( +, 1, c1c161471580a, -, 40 )},
-{HEX_DBL( +, 1, a16d3f97a4b01, -, 1 ), HEX_DBL( +, 1, 2db10fc4d8, -, 2 ), HEX_DBL( +, 1, ab1aa62214581, -, 42 )},
-{HEX_DBL( +, 1, 9c2d14ee4a101, -, 1 ), HEX_DBL( +, 1, 406463b1b,  -, 2 ), HEX_DBL( +, 1, 12e95dbda6611, -, 44 )},
-{HEX_DBL( +, 1, 970e4f80cb872, -, 1 ), HEX_DBL( +, 1, 52dbdfc4c8, -, 2 ), HEX_DBL( +, 1, 6b53fee511af,  -, 42 )},
-{HEX_DBL( +, 1, 920fb49d0e228, -, 1 ), HEX_DBL( +, 1, 6518fe467,  -, 2 ), HEX_DBL( +, 1, eea7d7d7d1764, -, 40 )},
-{HEX_DBL( +, 1, 8d3018d3018d3, -, 1 ), HEX_DBL( +, 1, 771d2ba7e8, -, 2 ), HEX_DBL( +, 1, ecefa8d4fab97, -, 40 )},
-{HEX_DBL( +, 1, 886e5f0abb049, -, 1 ), HEX_DBL( +, 1, 88e9c72e08, -, 2 ), HEX_DBL( +, 1, 913ea3d33fd14, -, 41 )},
-{HEX_DBL( +, 1, 83c977ab2bedd, -, 1 ), HEX_DBL( +, 1, 9a802391e,  -, 2 ), HEX_DBL( +, 1, 197e845877c94, -, 41 )},
-{HEX_DBL( +, 1, 7f405fd017f4,  -, 1 ), HEX_DBL( +, 1, abe18797f,  -, 2 ), HEX_DBL( +, 1, f4a52f8e8a81,  -, 42 )},
-{HEX_DBL( +, 1, 7ad2208e0ecc3, -, 1 ), HEX_DBL( +, 1, bd0f2e9e78, -, 2 ), HEX_DBL( +, 1, 031f4336644cc, -, 42 )},
-{HEX_DBL( +, 1, 767dce434a9b1, -, 1 ), HEX_DBL( +, 1, ce0a4923a,  -, 2 ), HEX_DBL( +, 1, 61f33c897020c, -, 40 )},
-{HEX_DBL( +, 1, 724287f46debc, -, 1 ), HEX_DBL( +, 1, ded3fd442,  -, 2 ), HEX_DBL( +, 1, b2632e830632,  -, 41 )},
-{HEX_DBL( +, 1, 6e1f76b4337c6, -, 1 ), HEX_DBL( +, 1, ef6d673288, -, 2 ), HEX_DBL( +, 1, 888ec245a0bf,  -, 40 )},
-{HEX_DBL( +, 1, 6a13cd153729,  -, 1 ), HEX_DBL( +, 1, ffd799a838, -, 2 ), HEX_DBL( +, 1, fe6f3b2f5fc8e, -, 40 )},
-{HEX_DBL( +, 1, 661ec6a5122f9, -, 1 ), HEX_DBL( +, 1, 0809cf27f4, -, 1 ), HEX_DBL( +, 1, 81eaa9ef284dd, -, 40 )},
-{HEX_DBL( +, 1, 623fa7701623f, -, 1 ), HEX_DBL( +, 1, 10113b153c, -, 1 ), HEX_DBL( +, 1, 1d7b07d6b1143, -, 42 )},
-{HEX_DBL( +, 1, 5e75bb8d015e7, -, 1 ), HEX_DBL( +, 1, 18028cf728, -, 1 ), HEX_DBL( +, 1, 76b100b1f6c6,  -, 41 )},
-{HEX_DBL( +, 1, 5ac056b015ac,  -, 1 ), HEX_DBL( +, 1, 1fde3d30e8, -, 1 ), HEX_DBL( +, 1, 26faeb9870945, -, 45 )},
-{HEX_DBL( +, 1, 571ed3c506b39, -, 1 ), HEX_DBL( +, 1, 27a4c0585c, -, 1 ), HEX_DBL( +, 1, 7f2c5344d762b, -, 42 )}
+    { HEX_DBL(+, 1, 5390948f40fea, +, 0), HEX_DBL(-, 1, a152f142a, -, 2),
+      HEX_DBL(+, 1, f93e27b43bd2c, -, 40) },
+    { HEX_DBL(+, 1, 5015015015015, +, 0), HEX_DBL(-, 1, 921800925, -, 2),
+      HEX_DBL(+, 1, 162432a1b8df7, -, 41) },
+    { HEX_DBL(+, 1, 4cab88725af6e, +, 0), HEX_DBL(-, 1, 8304d90c18, -, 2),
+      HEX_DBL(+, 1, 80bb749056fe7, -, 40) },
+    { HEX_DBL(+, 1, 49539e3b2d066, +, 0), HEX_DBL(-, 1, 7418acebc, -, 2),
+      HEX_DBL(+, 1, ceac7f0607711, -, 43) },
+    { HEX_DBL(+, 1, 460cbc7f5cf9a, +, 0), HEX_DBL(-, 1, 6552b49988, -, 2),
+      HEX_DBL(+, 1, d8913d0e89fa, -, 42) },
+    { HEX_DBL(+, 1, 42d6625d51f86, +, 0), HEX_DBL(-, 1, 56b22e6b58, -, 2),
+      HEX_DBL(+, 1, c7eaf515033a1, -, 44) },
+    { HEX_DBL(+, 1, 3fb013fb013fb, +, 0), HEX_DBL(-, 1, 48365e696, -, 2),
+      HEX_DBL(+, 1, 434adcde7edc7, -, 41) },
+    { HEX_DBL(+, 1, 3c995a47babe7, +, 0), HEX_DBL(-, 1, 39de8e156, -, 2),
+      HEX_DBL(+, 1, 8246f8e527754, -, 40) },
+    { HEX_DBL(+, 1, 3991c2c187f63, +, 0), HEX_DBL(-, 1, 2baa0c34c, -, 2),
+      HEX_DBL(+, 1, e1513c28e180d, -, 42) },
+    { HEX_DBL(+, 1, 3698df3de0747, +, 0), HEX_DBL(-, 1, 1d982c9d58, -, 2),
+      HEX_DBL(+, 1, 63ea3fed4b8a2, -, 40) },
+    { HEX_DBL(+, 1, 33ae45b57bcb1, +, 0), HEX_DBL(-, 1, 0fa848045, -, 2),
+      HEX_DBL(+, 1, 32ccbacf1779b, -, 40) },
+    { HEX_DBL(+, 1, 30d190130d19, +, 0), HEX_DBL(-, 1, 01d9bbcfa8, -, 2),
+      HEX_DBL(+, 1, e2bfeb2b884aa, -, 42) },
+    { HEX_DBL(+, 1, 2e025c04b8097, +, 0), HEX_DBL(-, 1, e857d3d37, -, 3),
+      HEX_DBL(+, 1, d9309b4d2ea85, -, 40) },
+    { HEX_DBL(+, 1, 2b404ad012b4, +, 0), HEX_DBL(-, 1, cd3c712d4, -, 3),
+      HEX_DBL(+, 1, ddf360962d7ab, -, 40) },
+    { HEX_DBL(+, 1, 288b01288b012, +, 0), HEX_DBL(-, 1, b2602497e, -, 3),
+      HEX_DBL(+, 1, 597f8a121640f, -, 40) },
+    { HEX_DBL(+, 1, 25e22708092f1, +, 0), HEX_DBL(-, 1, 97c1cb13d, -, 3),
+      HEX_DBL(+, 1, 02807d15580dc, -, 40) },
+    { HEX_DBL(+, 1, 23456789abcdf, +, 0), HEX_DBL(-, 1, 7d60496d, -, 3),
+      HEX_DBL(+, 1, 12ce913d7a827, -, 41) },
+    { HEX_DBL(+, 1, 20b470c67c0d8, +, 0), HEX_DBL(-, 1, 633a8bf44, -, 3),
+      HEX_DBL(+, 1, 0648bca9c96bd, -, 40) },
+    { HEX_DBL(+, 1, 1e2ef3b3fb874, +, 0), HEX_DBL(-, 1, 494f863b9, -, 3),
+      HEX_DBL(+, 1, 066fceb89b0eb, -, 42) },
+    { HEX_DBL(+, 1, 1bb4a4046ed29, +, 0), HEX_DBL(-, 1, 2f9e32d5c, -, 3),
+      HEX_DBL(+, 1, 17b8b6c4f846b, -, 46) },
+    { HEX_DBL(+, 1, 19453808ca29c, +, 0), HEX_DBL(-, 1, 162593187, -, 3),
+      HEX_DBL(+, 1, 2c83506452154, -, 42) },
+    { HEX_DBL(+, 1, 16e0689427378, +, 0), HEX_DBL(-, 1, f9c95dc1e, -, 4),
+      HEX_DBL(+, 1, dd5d2183150f3, -, 41) },
+    { HEX_DBL(+, 1, 1485f0e0acd3b, +, 0), HEX_DBL(-, 1, c7b528b72, -, 4),
+      HEX_DBL(+, 1, 0e43c4f4e619d, -, 40) },
+    { HEX_DBL(+, 1, 12358e75d3033, +, 0), HEX_DBL(-, 1, 960caf9ac, -, 4),
+      HEX_DBL(+, 1, 20fbfd5902a1e, -, 42) },
+    { HEX_DBL(+, 1, 0fef010fef01, +, 0), HEX_DBL(-, 1, 64ce26c08, -, 4),
+      HEX_DBL(+, 1, 8ebeefb4ac467, -, 40) },
+    { HEX_DBL(+, 1, 0db20a88f4695, +, 0), HEX_DBL(-, 1, 33f7cde16, -, 4),
+      HEX_DBL(+, 1, 30b3312da7a7d, -, 40) },
+    { HEX_DBL(+, 1, 0b7e6ec259dc7, +, 0), HEX_DBL(-, 1, 0387efbcc, -, 4),
+      HEX_DBL(+, 1, 796f1632949c3, -, 40) },
+    { HEX_DBL(+, 1, 0953f39010953, +, 0), HEX_DBL(-, 1, a6f9c378, -, 5),
+      HEX_DBL(+, 1, 1687e151172cc, -, 40) },
+    { HEX_DBL(+, 1, 073260a47f7c6, +, 0), HEX_DBL(-, 1, 47aa07358, -, 5),
+      HEX_DBL(+, 1, 1f87e4a9cc778, -, 42) },
+    { HEX_DBL(+, 1, 05197f7d73404, +, 0), HEX_DBL(-, 1, d23afc498, -, 6),
+      HEX_DBL(+, 1, b183a6b628487, -, 40) },
+    { HEX_DBL(+, 1, 03091b51f5e1a, +, 0), HEX_DBL(-, 1, 16a21e21, -, 6),
+      HEX_DBL(+, 1, 7d75c58973ce5, -, 40) },
+    { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+    { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+    { HEX_DBL(+, 1, f44659e4a4271, -, 1), HEX_DBL(+, 1, 11cd1d51, -, 5),
+      HEX_DBL(+, 1, 9a0d857e2f4b2, -, 40) },
+    { HEX_DBL(+, 1, ecc07b301ecc, -, 1), HEX_DBL(+, 1, c4dfab908, -, 5),
+      HEX_DBL(+, 1, 55b53fce557fd, -, 40) },
+    { HEX_DBL(+, 1, e573ac901e573, -, 1), HEX_DBL(+, 1, 3aa2fdd26, -, 4),
+      HEX_DBL(+, 1, f1cb0c9532089, -, 40) },
+    { HEX_DBL(+, 1, de5d6e3f8868a, -, 1), HEX_DBL(+, 1, 918a16e46, -, 4),
+      HEX_DBL(+, 1, 9af0dcd65a6e1, -, 43) },
+    { HEX_DBL(+, 1, d77b654b82c33, -, 1), HEX_DBL(+, 1, e72ec117e, -, 4),
+      HEX_DBL(+, 1, a5b93c4ebe124, -, 40) },
+    { HEX_DBL(+, 1, d0cb58f6ec074, -, 1), HEX_DBL(+, 1, 1dcd19755, -, 3),
+      HEX_DBL(+, 1, 5be50e71ddc6c, -, 42) },
+    { HEX_DBL(+, 1, ca4b3055ee191, -, 1), HEX_DBL(+, 1, 476a9f983, -, 3),
+      HEX_DBL(+, 1, ee9a798719e7f, -, 40) },
+    { HEX_DBL(+, 1, c3f8f01c3f8f, -, 1), HEX_DBL(+, 1, 70742d4ef, -, 3),
+      HEX_DBL(+, 1, 3ff1352c1219c, -, 46) },
+    { HEX_DBL(+, 1, bdd2b899406f7, -, 1), HEX_DBL(+, 1, 98edd077e, -, 3),
+      HEX_DBL(+, 1, c383cd11362f4, -, 41) },
+    { HEX_DBL(+, 1, b7d6c3dda338b, -, 1), HEX_DBL(+, 1, c0db6cdd9, -, 3),
+      HEX_DBL(+, 1, 37bd85b1a824e, -, 41) },
+    { HEX_DBL(+, 1, b2036406c80d9, -, 1), HEX_DBL(+, 1, e840be74e, -, 3),
+      HEX_DBL(+, 1, a9334d525e1ec, -, 41) },
+    { HEX_DBL(+, 1, ac5701ac5701a, -, 1), HEX_DBL(+, 1, 0790adbb, -, 2),
+      HEX_DBL(+, 1, 8060bfb6a491, -, 41) },
+    { HEX_DBL(+, 1, a6d01a6d01a6d, -, 1), HEX_DBL(+, 1, 1ac05b2918, -, 2),
+      HEX_DBL(+, 1, c1c161471580a, -, 40) },
+    { HEX_DBL(+, 1, a16d3f97a4b01, -, 1), HEX_DBL(+, 1, 2db10fc4d8, -, 2),
+      HEX_DBL(+, 1, ab1aa62214581, -, 42) },
+    { HEX_DBL(+, 1, 9c2d14ee4a101, -, 1), HEX_DBL(+, 1, 406463b1b, -, 2),
+      HEX_DBL(+, 1, 12e95dbda6611, -, 44) },
+    { HEX_DBL(+, 1, 970e4f80cb872, -, 1), HEX_DBL(+, 1, 52dbdfc4c8, -, 2),
+      HEX_DBL(+, 1, 6b53fee511af, -, 42) },
+    { HEX_DBL(+, 1, 920fb49d0e228, -, 1), HEX_DBL(+, 1, 6518fe467, -, 2),
+      HEX_DBL(+, 1, eea7d7d7d1764, -, 40) },
+    { HEX_DBL(+, 1, 8d3018d3018d3, -, 1), HEX_DBL(+, 1, 771d2ba7e8, -, 2),
+      HEX_DBL(+, 1, ecefa8d4fab97, -, 40) },
+    { HEX_DBL(+, 1, 886e5f0abb049, -, 1), HEX_DBL(+, 1, 88e9c72e08, -, 2),
+      HEX_DBL(+, 1, 913ea3d33fd14, -, 41) },
+    { HEX_DBL(+, 1, 83c977ab2bedd, -, 1), HEX_DBL(+, 1, 9a802391e, -, 2),
+      HEX_DBL(+, 1, 197e845877c94, -, 41) },
+    { HEX_DBL(+, 1, 7f405fd017f4, -, 1), HEX_DBL(+, 1, abe18797f, -, 2),
+      HEX_DBL(+, 1, f4a52f8e8a81, -, 42) },
+    { HEX_DBL(+, 1, 7ad2208e0ecc3, -, 1), HEX_DBL(+, 1, bd0f2e9e78, -, 2),
+      HEX_DBL(+, 1, 031f4336644cc, -, 42) },
+    { HEX_DBL(+, 1, 767dce434a9b1, -, 1), HEX_DBL(+, 1, ce0a4923a, -, 2),
+      HEX_DBL(+, 1, 61f33c897020c, -, 40) },
+    { HEX_DBL(+, 1, 724287f46debc, -, 1), HEX_DBL(+, 1, ded3fd442, -, 2),
+      HEX_DBL(+, 1, b2632e830632, -, 41) },
+    { HEX_DBL(+, 1, 6e1f76b4337c6, -, 1), HEX_DBL(+, 1, ef6d673288, -, 2),
+      HEX_DBL(+, 1, 888ec245a0bf, -, 40) },
+    { HEX_DBL(+, 1, 6a13cd153729, -, 1), HEX_DBL(+, 1, ffd799a838, -, 2),
+      HEX_DBL(+, 1, fe6f3b2f5fc8e, -, 40) },
+    { HEX_DBL(+, 1, 661ec6a5122f9, -, 1), HEX_DBL(+, 1, 0809cf27f4, -, 1),
+      HEX_DBL(+, 1, 81eaa9ef284dd, -, 40) },
+    { HEX_DBL(+, 1, 623fa7701623f, -, 1), HEX_DBL(+, 1, 10113b153c, -, 1),
+      HEX_DBL(+, 1, 1d7b07d6b1143, -, 42) },
+    { HEX_DBL(+, 1, 5e75bb8d015e7, -, 1), HEX_DBL(+, 1, 18028cf728, -, 1),
+      HEX_DBL(+, 1, 76b100b1f6c6, -, 41) },
+    { HEX_DBL(+, 1, 5ac056b015ac, -, 1), HEX_DBL(+, 1, 1fde3d30e8, -, 1),
+      HEX_DBL(+, 1, 26faeb9870945, -, 45) },
+    { HEX_DBL(+, 1, 571ed3c506b39, -, 1), HEX_DBL(+, 1, 27a4c0585c, -, 1),
+      HEX_DBL(+, 1, 7f2c5344d762b, -, 42) }
 };
 
 static double __loglTable2[64][3] = {
-{HEX_DBL( +, 1, 01fbe7f0a1be6, +, 0 ), HEX_DBL( -, 1, 6cf6ddd26112a, -,  7 ), HEX_DBL( +, 1, 0725e5755e314, -, 60 )},
-{HEX_DBL( +, 1, 01eba93a97b12, +, 0 ), HEX_DBL( -, 1, 6155b1d99f603, -,  7 ), HEX_DBL( +, 1, 4bcea073117f4, -, 60 )},
-{HEX_DBL( +, 1, 01db6c9029cd1, +, 0 ), HEX_DBL( -, 1, 55b54153137ff, -,  7 ), HEX_DBL( +, 1, 21e8faccad0ec, -, 61 )},
-{HEX_DBL( +, 1, 01cb31f0f534c, +, 0 ), HEX_DBL( -, 1, 4a158c27245bd, -,  7 ), HEX_DBL( +, 1, 1a5b7bfbf35d3, -, 60 )},
-{HEX_DBL( +, 1, 01baf95c9723c, +, 0 ), HEX_DBL( -, 1, 3e76923e3d678, -,  7 ), HEX_DBL( +, 1, eee400eb5fe34, -, 62 )},
-{HEX_DBL( +, 1, 01aac2d2acee6, +, 0 ), HEX_DBL( -, 1, 32d85380ce776, -,  7 ), HEX_DBL( +, 1, cbf7a513937bd, -, 61 )},
-{HEX_DBL( +, 1, 019a8e52d401e, +, 0 ), HEX_DBL( -, 1, 273acfd74be72, -,  7 ), HEX_DBL( +, 1, 5c64599efa5e6, -, 60 )},
-{HEX_DBL( +, 1, 018a5bdca9e42, +, 0 ), HEX_DBL( -, 1, 1b9e072a2e65,  -,  7 ), HEX_DBL( +, 1, 364180e0a5d37, -, 60 )},
-{HEX_DBL( +, 1, 017a2b6fcc33e, +, 0 ), HEX_DBL( -, 1, 1001f961f3243, -,  7 ), HEX_DBL( +, 1, 63d795746f216, -, 60 )},
-{HEX_DBL( +, 1, 0169fd0bd8a8a, +, 0 ), HEX_DBL( -, 1, 0466a6671bca4, -,  7 ), HEX_DBL( +, 1, 4c99ff1907435, -, 60 )},
-{HEX_DBL( +, 1, 0159d0b06d129, +, 0 ), HEX_DBL( -, 1, f1981c445cd05, -,  8 ), HEX_DBL( +, 1, 4bfff6366b723, -, 62 )},
-{HEX_DBL( +, 1, 0149a65d275a6, +, 0 ), HEX_DBL( -, 1, da6460f76ab8c, -,  8 ), HEX_DBL( +, 1, 9c5404f47589c, -, 61 )},
-{HEX_DBL( +, 1, 01397e11a581b, +, 0 ), HEX_DBL( -, 1, c3321ab87f4ef, -,  8 ), HEX_DBL( +, 1, c0da537429cea, -, 61 )},
-{HEX_DBL( +, 1, 012957cd85a28, +, 0 ), HEX_DBL( -, 1, ac014958c112c, -,  8 ), HEX_DBL( +, 1, 000c2a1b595e3, -, 64 )},
-{HEX_DBL( +, 1, 0119339065ef7, +, 0 ), HEX_DBL( -, 1, 94d1eca95f67a, -,  8 ), HEX_DBL( +, 1, d8d20b0564d5,  -, 61 )},
-{HEX_DBL( +, 1, 01091159e4b3d, +, 0 ), HEX_DBL( -, 1, 7da4047b92b3e, -,  8 ), HEX_DBL( +, 1, 6194a5d68cf2,  -, 66 )},
-{HEX_DBL( +, 1, 00f8f129a0535, +, 0 ), HEX_DBL( -, 1, 667790a09bf77, -,  8 ), HEX_DBL( +, 1, ca230e0bea645, -, 61 )},
-{HEX_DBL( +, 1, 00e8d2ff374a1, +, 0 ), HEX_DBL( -, 1, 4f4c90e9c4ead, -,  8 ), HEX_DBL( +, 1, 1de3e7f350c1,  -, 61 )},
-{HEX_DBL( +, 1, 00d8b6da482ce, +, 0 ), HEX_DBL( -, 1, 3823052860649, -,  8 ), HEX_DBL( +, 1, 5789b4c5891b8, -, 64 )},
-{HEX_DBL( +, 1, 00c89cba71a8c, +, 0 ), HEX_DBL( -, 1, 20faed2dc9a9e, -,  8 ), HEX_DBL( +, 1, 9e7c40f9839fd, -, 62 )},
-{HEX_DBL( +, 1, 00b8849f52834, +, 0 ), HEX_DBL( -, 1, 09d448cb65014, -,  8 ), HEX_DBL( +, 1, 387e3e9b6d02,  -, 62 )},
-{HEX_DBL( +, 1, 00a86e88899a4, +, 0 ), HEX_DBL( -, 1, e55e2fa53ebf1, -,  9 ), HEX_DBL( +, 1, cdaa71fddfddf, -, 62 )},
-{HEX_DBL( +, 1, 00985a75b5e3f, +, 0 ), HEX_DBL( -, 1, b716b429dce0f, -,  9 ), HEX_DBL( +, 1, 2f2af081367bf, -, 63 )},
-{HEX_DBL( +, 1, 00884866766ee, +, 0 ), HEX_DBL( -, 1, 88d21ec7a16d7, -,  9 ), HEX_DBL( +, 1, fb95c228d6f16, -, 62 )},
-{HEX_DBL( +, 1, 0078385a6a61d, +, 0 ), HEX_DBL( -, 1, 5a906f219a9e8, -,  9 ), HEX_DBL( +, 1, 18aff10a89f29, -, 64 )},
-{HEX_DBL( +, 1, 00682a5130fbe, +, 0 ), HEX_DBL( -, 1, 2c51a4dae87f1, -,  9 ), HEX_DBL( +, 1, bcc7e33ddde3,  -, 63 )},
-{HEX_DBL( +, 1, 00581e4a69944, +, 0 ), HEX_DBL( -, 1, fc2b7f2d782b1, -, 10 ), HEX_DBL( +, 1, fe3ef3300a9fa, -, 64 )},
-{HEX_DBL( +, 1, 00481445b39a8, +, 0 ), HEX_DBL( -, 1, 9fb97df0b0b83, -, 10 ), HEX_DBL( +, 1, 0d9a601f2f324, -, 65 )},
-{HEX_DBL( +, 1, 00380c42ae963, +, 0 ), HEX_DBL( -, 1, 434d4546227ae, -, 10 ), HEX_DBL( +, 1, 0b9b6a5868f33, -, 63 )},
-{HEX_DBL( +, 1, 00280640fa271, +, 0 ), HEX_DBL( -, 1, cdcda8e930c19, -, 11 ), HEX_DBL( +, 1, 3d424ab39f789, -, 64 )},
-{HEX_DBL( +, 1, 0018024036051, +, 0 ), HEX_DBL( -, 1, 150c558601261, -, 11 ), HEX_DBL( +, 1, 285bb90327a0f, -, 64 )},
-{HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,             +,  0 ), HEX_DBL( +, 0, 0,             +,  0 )},
-{HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,             +,  0 ), HEX_DBL( +, 0, 0,             +,  0 )},
-{HEX_DBL( +, 1, ffa011fca0a1e, -, 1 ), HEX_DBL( +, 1, 14e5640c4197b, -, 10 ), HEX_DBL( +, 1, 95728136ae401, -, 63 )},
-{HEX_DBL( +, 1, ff6031f064e07, -, 1 ), HEX_DBL( +, 1, cd61806bf532d, -, 10 ), HEX_DBL( +, 1, 568a4f35d8538, -, 63 )},
-{HEX_DBL( +, 1, ff2061d532b9c, -, 1 ), HEX_DBL( +, 1, 42e34af550eda, -,  9 ), HEX_DBL( +, 1, 8f69cee55fec,  -, 62 )},
-{HEX_DBL( +, 1, fee0a1a513253, -, 1 ), HEX_DBL( +, 1, 9f0a5523902ea, -,  9 ), HEX_DBL( +, 1, daec734b11615, -, 63 )},
-{HEX_DBL( +, 1, fea0f15a12139, -, 1 ), HEX_DBL( +, 1, fb25e19f11b26, -,  9 ), HEX_DBL( +, 1, 8bafca62941da, -, 62 )},
-{HEX_DBL( +, 1, fe6150ee3e6d4, -, 1 ), HEX_DBL( +, 1, 2b9af9a28e282, -,  8 ), HEX_DBL( +, 1, 0fd3674e1dc5b, -, 61 )},
-{HEX_DBL( +, 1, fe21c05baa109, -, 1 ), HEX_DBL( +, 1, 599d4678f24b9, -,  8 ), HEX_DBL( +, 1, dafce1f09937b, -, 61 )},
-{HEX_DBL( +, 1, fde23f9c69cf9, -, 1 ), HEX_DBL( +, 1, 8799d8c046eb,  -,  8 ), HEX_DBL( +, 1, ffa0ce0bdd217, -, 65 )},
-{HEX_DBL( +, 1, fda2ceaa956e8, -, 1 ), HEX_DBL( +, 1, b590b1e5951ee, -,  8 ), HEX_DBL( +, 1, 645a769232446, -, 62 )},
-{HEX_DBL( +, 1, fd636d8047a1f, -, 1 ), HEX_DBL( +, 1, e381d3555dbcf, -,  8 ), HEX_DBL( +, 1, 882320d368331, -, 61 )},
-{HEX_DBL( +, 1, fd241c179e0cc, -, 1 ), HEX_DBL( +, 1, 08b69f3dccde,  -,  7 ), HEX_DBL( +, 1, 01ad5065aba9e, -, 61 )},
-{HEX_DBL( +, 1, fce4da6ab93e8, -, 1 ), HEX_DBL( +, 1, 1fa97a61dd298, -,  7 ), HEX_DBL( +, 1, 84cd1f931ae34, -, 60 )},
-{HEX_DBL( +, 1, fca5a873bcb19, -, 1 ), HEX_DBL( +, 1, 36997bcc54a3f, -,  7 ), HEX_DBL( +, 1, 1485e97eaee03, -, 60 )},
-{HEX_DBL( +, 1, fc66862ccec93, -, 1 ), HEX_DBL( +, 1, 4d86a43264a4f, -,  7 ), HEX_DBL( +, 1, c75e63370988b, -, 61 )},
-{HEX_DBL( +, 1, fc27739018cfe, -, 1 ), HEX_DBL( +, 1, 6470f448fb09d, -,  7 ), HEX_DBL( +, 1, d7361eeaed0a1, -, 65 )},
-{HEX_DBL( +, 1, fbe87097c6f5a, -, 1 ), HEX_DBL( +, 1, 7b586cc4c2523, -,  7 ), HEX_DBL( +, 1, b3df952cc473c, -, 61 )},
-{HEX_DBL( +, 1, fba97d3e084dd, -, 1 ), HEX_DBL( +, 1, 923d0e5a21e06, -,  7 ), HEX_DBL( +, 1, cf56c7b64ae5d, -, 62 )},
-{HEX_DBL( +, 1, fb6a997d0ecdc, -, 1 ), HEX_DBL( +, 1, a91ed9bd3df9a, -,  7 ), HEX_DBL( +, 1, b957bdcd89e43, -, 61 )},
-{HEX_DBL( +, 1, fb2bc54f0f4ab, -, 1 ), HEX_DBL( +, 1, bffdcfa1f7fbb, -,  7 ), HEX_DBL( +, 1, ea8cad9a21771, -, 62 )},
-{HEX_DBL( +, 1, faed00ae41783, -, 1 ), HEX_DBL( +, 1, d6d9f0bbee6f6, -,  7 ), HEX_DBL( +, 1, 5762a9af89c82, -, 60 )},
-{HEX_DBL( +, 1, faae4b94dfe64, -, 1 ), HEX_DBL( +, 1, edb33dbe7d335, -,  7 ), HEX_DBL( +, 1, 21e24fc245697, -, 62 )},
-{HEX_DBL( +, 1, fa6fa5fd27ff8, -, 1 ), HEX_DBL( +, 1, 0244dbae5ed05, -,  6 ), HEX_DBL( +, 1, 12ef51b967102, -, 60 )},
-{HEX_DBL( +, 1, fa310fe15a078, -, 1 ), HEX_DBL( +, 1, 0daeaf24c3529, -,  6 ), HEX_DBL( +, 1, 10d3cfca60b45, -, 59 )},
-{HEX_DBL( +, 1, f9f2893bb9192, -, 1 ), HEX_DBL( +, 1, 1917199bb66bc, -,  6 ), HEX_DBL( +, 1, 6cf6034c32e19, -, 60 )},
-{HEX_DBL( +, 1, f9b412068b247, -, 1 ), HEX_DBL( +, 1, 247e1b6c615d5, -,  6 ), HEX_DBL( +, 1, 42f0fffa229f7, -, 61 )},
-{HEX_DBL( +, 1, f975aa3c18ed6, -, 1 ), HEX_DBL( +, 1, 2fe3b4efcc5ad, -,  6 ), HEX_DBL( +, 1, 70106136a8919, -, 60 )},
-{HEX_DBL( +, 1, f93751d6ae09b, -, 1 ), HEX_DBL( +, 1, 3b47e67edea93, -,  6 ), HEX_DBL( +, 1, 38dd5a4f6959a, -, 59 )},
-{HEX_DBL( +, 1, f8f908d098df6, -, 1 ), HEX_DBL( +, 1, 46aab0725ea6c, -,  6 ), HEX_DBL( +, 1, 821fc1e799e01, -, 60 )},
-{HEX_DBL( +, 1, f8bacf242aa2c, -, 1 ), HEX_DBL( +, 1, 520c1322f1e4e, -,  6 ), HEX_DBL( +, 1, 129dcda3ad563, -, 60 )},
-{HEX_DBL( +, 1, f87ca4cbb755,  -, 1 ), HEX_DBL( +, 1, 5d6c0ee91d2ab, -,  6 ), HEX_DBL( +, 1, c5b190c04606e, -, 62 )},
-{HEX_DBL( +, 1, f83e89c195c25, -, 1 ), HEX_DBL( +, 1, 68caa41d448c3, -,  6 ), HEX_DBL( +, 1, 4723441195ac9, -, 59 )}
+    { HEX_DBL(+, 1, 01fbe7f0a1be6, +, 0), HEX_DBL(-, 1, 6cf6ddd26112a, -, 7),
+      HEX_DBL(+, 1, 0725e5755e314, -, 60) },
+    { HEX_DBL(+, 1, 01eba93a97b12, +, 0), HEX_DBL(-, 1, 6155b1d99f603, -, 7),
+      HEX_DBL(+, 1, 4bcea073117f4, -, 60) },
+    { HEX_DBL(+, 1, 01db6c9029cd1, +, 0), HEX_DBL(-, 1, 55b54153137ff, -, 7),
+      HEX_DBL(+, 1, 21e8faccad0ec, -, 61) },
+    { HEX_DBL(+, 1, 01cb31f0f534c, +, 0), HEX_DBL(-, 1, 4a158c27245bd, -, 7),
+      HEX_DBL(+, 1, 1a5b7bfbf35d3, -, 60) },
+    { HEX_DBL(+, 1, 01baf95c9723c, +, 0), HEX_DBL(-, 1, 3e76923e3d678, -, 7),
+      HEX_DBL(+, 1, eee400eb5fe34, -, 62) },
+    { HEX_DBL(+, 1, 01aac2d2acee6, +, 0), HEX_DBL(-, 1, 32d85380ce776, -, 7),
+      HEX_DBL(+, 1, cbf7a513937bd, -, 61) },
+    { HEX_DBL(+, 1, 019a8e52d401e, +, 0), HEX_DBL(-, 1, 273acfd74be72, -, 7),
+      HEX_DBL(+, 1, 5c64599efa5e6, -, 60) },
+    { HEX_DBL(+, 1, 018a5bdca9e42, +, 0), HEX_DBL(-, 1, 1b9e072a2e65, -, 7),
+      HEX_DBL(+, 1, 364180e0a5d37, -, 60) },
+    { HEX_DBL(+, 1, 017a2b6fcc33e, +, 0), HEX_DBL(-, 1, 1001f961f3243, -, 7),
+      HEX_DBL(+, 1, 63d795746f216, -, 60) },
+    { HEX_DBL(+, 1, 0169fd0bd8a8a, +, 0), HEX_DBL(-, 1, 0466a6671bca4, -, 7),
+      HEX_DBL(+, 1, 4c99ff1907435, -, 60) },
+    { HEX_DBL(+, 1, 0159d0b06d129, +, 0), HEX_DBL(-, 1, f1981c445cd05, -, 8),
+      HEX_DBL(+, 1, 4bfff6366b723, -, 62) },
+    { HEX_DBL(+, 1, 0149a65d275a6, +, 0), HEX_DBL(-, 1, da6460f76ab8c, -, 8),
+      HEX_DBL(+, 1, 9c5404f47589c, -, 61) },
+    { HEX_DBL(+, 1, 01397e11a581b, +, 0), HEX_DBL(-, 1, c3321ab87f4ef, -, 8),
+      HEX_DBL(+, 1, c0da537429cea, -, 61) },
+    { HEX_DBL(+, 1, 012957cd85a28, +, 0), HEX_DBL(-, 1, ac014958c112c, -, 8),
+      HEX_DBL(+, 1, 000c2a1b595e3, -, 64) },
+    { HEX_DBL(+, 1, 0119339065ef7, +, 0), HEX_DBL(-, 1, 94d1eca95f67a, -, 8),
+      HEX_DBL(+, 1, d8d20b0564d5, -, 61) },
+    { HEX_DBL(+, 1, 01091159e4b3d, +, 0), HEX_DBL(-, 1, 7da4047b92b3e, -, 8),
+      HEX_DBL(+, 1, 6194a5d68cf2, -, 66) },
+    { HEX_DBL(+, 1, 00f8f129a0535, +, 0), HEX_DBL(-, 1, 667790a09bf77, -, 8),
+      HEX_DBL(+, 1, ca230e0bea645, -, 61) },
+    { HEX_DBL(+, 1, 00e8d2ff374a1, +, 0), HEX_DBL(-, 1, 4f4c90e9c4ead, -, 8),
+      HEX_DBL(+, 1, 1de3e7f350c1, -, 61) },
+    { HEX_DBL(+, 1, 00d8b6da482ce, +, 0), HEX_DBL(-, 1, 3823052860649, -, 8),
+      HEX_DBL(+, 1, 5789b4c5891b8, -, 64) },
+    { HEX_DBL(+, 1, 00c89cba71a8c, +, 0), HEX_DBL(-, 1, 20faed2dc9a9e, -, 8),
+      HEX_DBL(+, 1, 9e7c40f9839fd, -, 62) },
+    { HEX_DBL(+, 1, 00b8849f52834, +, 0), HEX_DBL(-, 1, 09d448cb65014, -, 8),
+      HEX_DBL(+, 1, 387e3e9b6d02, -, 62) },
+    { HEX_DBL(+, 1, 00a86e88899a4, +, 0), HEX_DBL(-, 1, e55e2fa53ebf1, -, 9),
+      HEX_DBL(+, 1, cdaa71fddfddf, -, 62) },
+    { HEX_DBL(+, 1, 00985a75b5e3f, +, 0), HEX_DBL(-, 1, b716b429dce0f, -, 9),
+      HEX_DBL(+, 1, 2f2af081367bf, -, 63) },
+    { HEX_DBL(+, 1, 00884866766ee, +, 0), HEX_DBL(-, 1, 88d21ec7a16d7, -, 9),
+      HEX_DBL(+, 1, fb95c228d6f16, -, 62) },
+    { HEX_DBL(+, 1, 0078385a6a61d, +, 0), HEX_DBL(-, 1, 5a906f219a9e8, -, 9),
+      HEX_DBL(+, 1, 18aff10a89f29, -, 64) },
+    { HEX_DBL(+, 1, 00682a5130fbe, +, 0), HEX_DBL(-, 1, 2c51a4dae87f1, -, 9),
+      HEX_DBL(+, 1, bcc7e33ddde3, -, 63) },
+    { HEX_DBL(+, 1, 00581e4a69944, +, 0), HEX_DBL(-, 1, fc2b7f2d782b1, -, 10),
+      HEX_DBL(+, 1, fe3ef3300a9fa, -, 64) },
+    { HEX_DBL(+, 1, 00481445b39a8, +, 0), HEX_DBL(-, 1, 9fb97df0b0b83, -, 10),
+      HEX_DBL(+, 1, 0d9a601f2f324, -, 65) },
+    { HEX_DBL(+, 1, 00380c42ae963, +, 0), HEX_DBL(-, 1, 434d4546227ae, -, 10),
+      HEX_DBL(+, 1, 0b9b6a5868f33, -, 63) },
+    { HEX_DBL(+, 1, 00280640fa271, +, 0), HEX_DBL(-, 1, cdcda8e930c19, -, 11),
+      HEX_DBL(+, 1, 3d424ab39f789, -, 64) },
+    { HEX_DBL(+, 1, 0018024036051, +, 0), HEX_DBL(-, 1, 150c558601261, -, 11),
+      HEX_DBL(+, 1, 285bb90327a0f, -, 64) },
+    { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+    { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+    { HEX_DBL(+, 1, ffa011fca0a1e, -, 1), HEX_DBL(+, 1, 14e5640c4197b, -, 10),
+      HEX_DBL(+, 1, 95728136ae401, -, 63) },
+    { HEX_DBL(+, 1, ff6031f064e07, -, 1), HEX_DBL(+, 1, cd61806bf532d, -, 10),
+      HEX_DBL(+, 1, 568a4f35d8538, -, 63) },
+    { HEX_DBL(+, 1, ff2061d532b9c, -, 1), HEX_DBL(+, 1, 42e34af550eda, -, 9),
+      HEX_DBL(+, 1, 8f69cee55fec, -, 62) },
+    { HEX_DBL(+, 1, fee0a1a513253, -, 1), HEX_DBL(+, 1, 9f0a5523902ea, -, 9),
+      HEX_DBL(+, 1, daec734b11615, -, 63) },
+    { HEX_DBL(+, 1, fea0f15a12139, -, 1), HEX_DBL(+, 1, fb25e19f11b26, -, 9),
+      HEX_DBL(+, 1, 8bafca62941da, -, 62) },
+    { HEX_DBL(+, 1, fe6150ee3e6d4, -, 1), HEX_DBL(+, 1, 2b9af9a28e282, -, 8),
+      HEX_DBL(+, 1, 0fd3674e1dc5b, -, 61) },
+    { HEX_DBL(+, 1, fe21c05baa109, -, 1), HEX_DBL(+, 1, 599d4678f24b9, -, 8),
+      HEX_DBL(+, 1, dafce1f09937b, -, 61) },
+    { HEX_DBL(+, 1, fde23f9c69cf9, -, 1), HEX_DBL(+, 1, 8799d8c046eb, -, 8),
+      HEX_DBL(+, 1, ffa0ce0bdd217, -, 65) },
+    { HEX_DBL(+, 1, fda2ceaa956e8, -, 1), HEX_DBL(+, 1, b590b1e5951ee, -, 8),
+      HEX_DBL(+, 1, 645a769232446, -, 62) },
+    { HEX_DBL(+, 1, fd636d8047a1f, -, 1), HEX_DBL(+, 1, e381d3555dbcf, -, 8),
+      HEX_DBL(+, 1, 882320d368331, -, 61) },
+    { HEX_DBL(+, 1, fd241c179e0cc, -, 1), HEX_DBL(+, 1, 08b69f3dccde, -, 7),
+      HEX_DBL(+, 1, 01ad5065aba9e, -, 61) },
+    { HEX_DBL(+, 1, fce4da6ab93e8, -, 1), HEX_DBL(+, 1, 1fa97a61dd298, -, 7),
+      HEX_DBL(+, 1, 84cd1f931ae34, -, 60) },
+    { HEX_DBL(+, 1, fca5a873bcb19, -, 1), HEX_DBL(+, 1, 36997bcc54a3f, -, 7),
+      HEX_DBL(+, 1, 1485e97eaee03, -, 60) },
+    { HEX_DBL(+, 1, fc66862ccec93, -, 1), HEX_DBL(+, 1, 4d86a43264a4f, -, 7),
+      HEX_DBL(+, 1, c75e63370988b, -, 61) },
+    { HEX_DBL(+, 1, fc27739018cfe, -, 1), HEX_DBL(+, 1, 6470f448fb09d, -, 7),
+      HEX_DBL(+, 1, d7361eeaed0a1, -, 65) },
+    { HEX_DBL(+, 1, fbe87097c6f5a, -, 1), HEX_DBL(+, 1, 7b586cc4c2523, -, 7),
+      HEX_DBL(+, 1, b3df952cc473c, -, 61) },
+    { HEX_DBL(+, 1, fba97d3e084dd, -, 1), HEX_DBL(+, 1, 923d0e5a21e06, -, 7),
+      HEX_DBL(+, 1, cf56c7b64ae5d, -, 62) },
+    { HEX_DBL(+, 1, fb6a997d0ecdc, -, 1), HEX_DBL(+, 1, a91ed9bd3df9a, -, 7),
+      HEX_DBL(+, 1, b957bdcd89e43, -, 61) },
+    { HEX_DBL(+, 1, fb2bc54f0f4ab, -, 1), HEX_DBL(+, 1, bffdcfa1f7fbb, -, 7),
+      HEX_DBL(+, 1, ea8cad9a21771, -, 62) },
+    { HEX_DBL(+, 1, faed00ae41783, -, 1), HEX_DBL(+, 1, d6d9f0bbee6f6, -, 7),
+      HEX_DBL(+, 1, 5762a9af89c82, -, 60) },
+    { HEX_DBL(+, 1, faae4b94dfe64, -, 1), HEX_DBL(+, 1, edb33dbe7d335, -, 7),
+      HEX_DBL(+, 1, 21e24fc245697, -, 62) },
+    { HEX_DBL(+, 1, fa6fa5fd27ff8, -, 1), HEX_DBL(+, 1, 0244dbae5ed05, -, 6),
+      HEX_DBL(+, 1, 12ef51b967102, -, 60) },
+    { HEX_DBL(+, 1, fa310fe15a078, -, 1), HEX_DBL(+, 1, 0daeaf24c3529, -, 6),
+      HEX_DBL(+, 1, 10d3cfca60b45, -, 59) },
+    { HEX_DBL(+, 1, f9f2893bb9192, -, 1), HEX_DBL(+, 1, 1917199bb66bc, -, 6),
+      HEX_DBL(+, 1, 6cf6034c32e19, -, 60) },
+    { HEX_DBL(+, 1, f9b412068b247, -, 1), HEX_DBL(+, 1, 247e1b6c615d5, -, 6),
+      HEX_DBL(+, 1, 42f0fffa229f7, -, 61) },
+    { HEX_DBL(+, 1, f975aa3c18ed6, -, 1), HEX_DBL(+, 1, 2fe3b4efcc5ad, -, 6),
+      HEX_DBL(+, 1, 70106136a8919, -, 60) },
+    { HEX_DBL(+, 1, f93751d6ae09b, -, 1), HEX_DBL(+, 1, 3b47e67edea93, -, 6),
+      HEX_DBL(+, 1, 38dd5a4f6959a, -, 59) },
+    { HEX_DBL(+, 1, f8f908d098df6, -, 1), HEX_DBL(+, 1, 46aab0725ea6c, -, 6),
+      HEX_DBL(+, 1, 821fc1e799e01, -, 60) },
+    { HEX_DBL(+, 1, f8bacf242aa2c, -, 1), HEX_DBL(+, 1, 520c1322f1e4e, -, 6),
+      HEX_DBL(+, 1, 129dcda3ad563, -, 60) },
+    { HEX_DBL(+, 1, f87ca4cbb755, -, 1), HEX_DBL(+, 1, 5d6c0ee91d2ab, -, 6),
+      HEX_DBL(+, 1, c5b190c04606e, -, 62) },
+    { HEX_DBL(+, 1, f83e89c195c25, -, 1), HEX_DBL(+, 1, 68caa41d448c3, -, 6),
+      HEX_DBL(+, 1, 4723441195ac9, -, 59) }
 };
 
 static double __loglTable3[8][3] = {
-{HEX_DBL( +, 1, 000e00c40ab89, +, 0 ), HEX_DBL( -, 1, 4332be0032168, -, 12 ), HEX_DBL( +, 1, a1003588d217a, -, 65 )},
-{HEX_DBL( +, 1, 000a006403e82, +, 0 ), HEX_DBL( -, 1, cdb2987366fcc, -, 13 ), HEX_DBL( +, 1, 5c86001294bbc, -, 67 )},
-{HEX_DBL( +, 1, 0006002400d8,  +, 0 ), HEX_DBL( -, 1, 150297c90fa6f, -, 13 ), HEX_DBL( +, 1, 01fb4865fae32, -, 66 )},
-{HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,             +,  0 ), HEX_DBL( +, 0, 0,             +,  0 )},
-{HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,             +,  0 ), HEX_DBL( +, 0, 0,             +,  0 )},
-{HEX_DBL( +, 1, ffe8011ff280a, -, 1 ), HEX_DBL( +, 1, 14f8daf5e3d3b, -, 12 ), HEX_DBL( +, 1, 3c933b4b6b914, -, 68 )},
-{HEX_DBL( +, 1, ffd8031fc184e, -, 1 ), HEX_DBL( +, 1, cd978c38042bb, -, 12 ), HEX_DBL( +, 1, 10f8e642e66fd, -, 65 )},
-{HEX_DBL( +, 1, ffc8061f5492b, -, 1 ), HEX_DBL( +, 1, 43183c878274e, -, 11 ), HEX_DBL( +, 1, 5885dd1eb6582, -, 65 )}
+    { HEX_DBL(+, 1, 000e00c40ab89, +, 0), HEX_DBL(-, 1, 4332be0032168, -, 12),
+      HEX_DBL(+, 1, a1003588d217a, -, 65) },
+    { HEX_DBL(+, 1, 000a006403e82, +, 0), HEX_DBL(-, 1, cdb2987366fcc, -, 13),
+      HEX_DBL(+, 1, 5c86001294bbc, -, 67) },
+    { HEX_DBL(+, 1, 0006002400d8, +, 0), HEX_DBL(-, 1, 150297c90fa6f, -, 13),
+      HEX_DBL(+, 1, 01fb4865fae32, -, 66) },
+    { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+    { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+    { HEX_DBL(+, 1, ffe8011ff280a, -, 1), HEX_DBL(+, 1, 14f8daf5e3d3b, -, 12),
+      HEX_DBL(+, 1, 3c933b4b6b914, -, 68) },
+    { HEX_DBL(+, 1, ffd8031fc184e, -, 1), HEX_DBL(+, 1, cd978c38042bb, -, 12),
+      HEX_DBL(+, 1, 10f8e642e66fd, -, 65) },
+    { HEX_DBL(+, 1, ffc8061f5492b, -, 1), HEX_DBL(+, 1, 43183c878274e, -, 11),
+      HEX_DBL(+, 1, 5885dd1eb6582, -, 65) }
 };
 
 static void __log2_ep(double *hi, double *lo, double x)
 {
-    union { uint64_t i; double d; } uu;
+    union {
+        uint64_t i;
+        double d;
+    } uu;
 
     int m;
     double f = reference_frexp(x, &m);
 
     // bring f in [0.75, 1.5)
-    if( f < 0.75 ) {
+    if (f < 0.75)
+    {
         f *= 2.0;
         m -= 1;
     }
 
     // index first table .... brings down to [1-2^-7, 1+2^6)
     uu.d = f;
-    int index = (int) (((uu.i + ((uint64_t) 1 << 51)) & 0x000fc00000000000ULL) >> 46);
+    int index =
+        (int)(((uu.i + ((uint64_t)1 << 51)) & 0x000fc00000000000ULL) >> 46);
     double r1 = __loglTable1[index][0];
     double logr1hi = __loglTable1[index][1];
     double logr1lo = __loglTable1[index][2];
-    // since log1rhi has 39 bits of precision, we have 14 bit in hand ... since |m| <= 1023
-    // which needs 10bits at max, we can directly add m to log1hi without spilling
+    // since logr1hi has 39 bits of precision, we have 14 bits in hand ...
+    // since |m| <= 1023 which needs 10 bits at max, we can directly add m to
+    // logr1hi without spilling
     logr1hi += m;
 
-    // argument reduction needs to be in double-double since reduced argument will form the
-    // leading term of polynomial approximation which sets the precision we eventually achieve
+    // argument reduction needs to be in double-double since reduced argument
+    // will form the leading term of polynomial approximation which sets the
+    // precision we eventually achieve
     double zhi, zlo;
     MulD(&zhi, &zlo, r1, uu.d);
 
     // second index table .... brings down to [1-2^-12, 1+2^-11)
     uu.d = zhi;
-    index = (int) (((uu.i + ((uint64_t) 1 << 46)) & 0x00007e0000000000ULL) >> 41);
+    index = (int)(((uu.i + ((uint64_t)1 << 46)) & 0x00007e0000000000ULL) >> 41);
     double r2 = __loglTable2[index][0];
     double logr2hi = __loglTable2[index][1];
     double logr2lo = __loglTable2[index][2];
@@ -4186,11 +4754,12 @@ static void __log2_ep(double *hi, double *lo, double x)
     // Actually reduction to 2^-11 would have been sufficient to calculate
     // second order term in polynomial in double rather than double-double, I
     // reduced it a bit more to make sure other systematic arithmetic errors
-    // are guarded against .... also this allow lower order product of leading polynomial
-    // term i.e. Ao_hi*z_lo + Ao_lo*z_hi to be done in double rather than double-double ...
-    // hence only term that needs to be done in double-double is Ao_hi*z_hi
+    // are guarded against .... also this allow lower order product of leading
+    // polynomial term i.e. Ao_hi*z_lo + Ao_lo*z_hi to be done in double rather
+    // than double-double ... hence only term that needs to be done in
+    // double-double is Ao_hi*z_hi
     uu.d = zhi;
-    index = (int) (((uu.i + ((uint64_t) 1 << 41)) & 0x0000038000000000ULL) >> 39);
+    index = (int)(((uu.i + ((uint64_t)1 << 41)) & 0x0000038000000000ULL) >> 39);
     double r3 = __loglTable3[index][0];
     double logr3hi = __loglTable3[index][1];
     double logr3lo = __loglTable3[index][2];
@@ -4202,34 +4771,36 @@ static void __log2_ep(double *hi, double *lo, double x)
     AddDD(&log2hi, &log2lo, logr1hi, logr1lo, logr2hi, logr2lo);
     AddDD(&log2hi, &log2lo, logr3hi, logr3lo, log2hi, log2lo);
 
-    // final argument reduction .... zhi will be in [1-2^-14, 1+2^-13) after this
+    // final argument reduction .... zhi will be in [1-2^-14, 1+2^-13) after
+    // this
     MulDD(&zhi, &zlo, zhi, zlo, r3, 0.0);
-    // we dont need to do full double-double substract here. substracting 1.0 for higher
-    // term is exact
+    // we don't need to do full double-double subtract here. subtracting 1.0
+    // for higher term is exact
     zhi = zhi - 1.0;
     // normalize
     AddD(&zhi, &zlo, zhi, zlo);
 
     // polynomail fitting to compute log2(1 + z) ... forth order polynomial fit
-    // to log2(1 + z)/z gives minimax absolute error of O(2^-76) with z in [-2^-14, 2^-13]
-    // log2(1 + z)/z = Ao + A1*z + A2*z^2 + A3*z^3 + A4*z^4
+    // to log2(1 + z)/z gives minimax absolute error of O(2^-76) with z in
+    // [-2^-14, 2^-13]: log2(1 + z)/z = Ao + A1*z + A2*z^2 + A3*z^3 + A4*z^4
     // => log2(1 + z) = Ao*z + A1*z^2 + A2*z^3 + A3*z^4 + A4*z^5
-    // => log2(1 + z) = (Aohi + Aolo)*(zhi + zlo) + z^2*(A1 + A2*z + A3*z^2 + A4*z^3)
-    // since we are looking for at least 64 digits of precision and z in [-2^-14, 2^-13], final term
-    // can be done in double .... also Aolo*zhi + Aohi*zlo can be done in double ....
-    // Aohi*zhi needs to be done in double-double
-
-    double Aohi = HEX_DBL( +, 1, 71547652b82fe, +, 0 );
-    double Aolo = HEX_DBL( +, 1, 777c9cbb675c, -, 56 );
+    // => log2(1 + z) = (Aohi + Aolo)*(zhi + zlo) + z^2*(A1 + A2*z + A3*z^2 +
+    // A4*z^3). Since we are looking for at least 64 digits of precision and z
+    // in [-2^-14, 2^-13], the final term can be done in double. Aolo*zhi +
+    // Aohi*zlo can also be done in double; only Aohi*zhi needs to be done in
+    // double-double.
+
+    double Aohi = HEX_DBL(+, 1, 71547652b82fe, +, 0);
+    double Aolo = HEX_DBL(+, 1, 777c9cbb675c, -, 56);
     double y;
-    y = HEX_DBL( +, 1, 276d2736fade7, -, 2 );
-    y = HEX_DBL( -, 1, 7154765782df1, -, 2 ) + y*zhi;
-    y = HEX_DBL( +, 1, ec709dc3a0f67, -, 2 ) + y*zhi;
-    y = HEX_DBL( -, 1, 71547652b82fe, -, 1 ) + y*zhi;
-    double zhisq = zhi*zhi;
-    y = y*zhisq;
-    y = y + zhi*Aolo;
-    y = y + zlo*Aohi;
+    y = HEX_DBL(+, 1, 276d2736fade7, -, 2);
+    y = HEX_DBL(-, 1, 7154765782df1, -, 2) + y * zhi;
+    y = HEX_DBL(+, 1, ec709dc3a0f67, -, 2) + y * zhi;
+    y = HEX_DBL(-, 1, 71547652b82fe, -, 1) + y * zhi;
+    double zhisq = zhi * zhi;
+    y = y * zhisq;
+    y = y + zhi * Aolo;
+    y = y + zlo * Aohi;
 
     MulD(&zhi, &zlo, Aohi, zhi);
     AddDD(&zhi, &zlo, zhi, zlo, y, 0.0);
@@ -4239,7 +4810,7 @@ static void __log2_ep(double *hi, double *lo, double x)
     *lo = zlo;
 }
 
-long double reference_powl( long double x, long double y )
+long double reference_powl(long double x, long double y)
 {
 
 
@@ -4256,174 +4827,166 @@ long double reference_powl( long double x, long double y )
     // causes errors. So we need to tread y as long double and convert it
     // to hi, lo doubles when performing y*log2(x).
 
-//    double x = (double) xx;
-//    double y = (double) yy;
+    //    double x = (double) xx;
+    //    double y = (double) yy;
 
-    static const double neg_epsilon = HEX_DBL( +, 1, 0, +, 53 );
+    static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53);
 
-    //if x = 1, return x for any y, even NaN
-    if( x == 1.0 )
-        return x;
+    // if x = 1, return x for any y, even NaN
+    if (x == 1.0) return x;
 
-    //if y == 0, return 1 for any x, even NaN
-    if( y == 0.0 )
-        return 1.0L;
+    // if y == 0, return 1 for any x, even NaN
+    if (y == 0.0) return 1.0L;
 
-    //get NaNs out of the way
-    if( x != x  || y != y )
-        return x + y;
+    // get NaNs out of the way
+    if (x != x || y != y) return x + y;
 
-    //do the work required to sort out edge cases
-    double fabsy = reference_fabs( y );
-    double fabsx = reference_fabs( x );
-    double iy = reference_rint( fabsy );            //we do round to nearest here so that |fy| <= 0.5
-    if( iy > fabsy )//convert nearbyint to floor
+    // do the work required to sort out edge cases
+    double fabsy = reference_fabs(y);
+    double fabsx = reference_fabs(x);
+    double iy = reference_rint(
+        fabsy); // we do round to nearest here so that |fy| <= 0.5
+    if (iy > fabsy) // convert nearbyint to floor
         iy -= 1.0;
     int isOddInt = 0;
-    if( fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon )
-        isOddInt =     (int) (iy - 2.0 * rint( 0.5 * iy ));        //might be 0, -1, or 1
+    if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon)
+        isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1
 
-    ///test a few more edge cases
-    //deal with x == 0 cases
-    if( x == 0.0 )
+    /// test a few more edge cases
+    // deal with x == 0 cases
+    if (x == 0.0)
     {
-        if( ! isOddInt )
-            x = 0.0;
+        if (!isOddInt) x = 0.0;
 
-        if( y < 0 )
-            x = 1.0/ x;
+        if (y < 0) x = 1.0 / x;
 
         return x;
     }
 
-    //x == +-Inf cases
-    if( isinf(fabsx) )
+    // x == +-Inf cases
+    if (isinf(fabsx))
     {
-        if( x < 0 )
+        if (x < 0)
         {
-            if( isOddInt )
+            if (isOddInt)
             {
-                if( y < 0 )
+                if (y < 0)
                     return -0.0;
                 else
                     return -INFINITY;
             }
             else
             {
-                if( y < 0 )
+                if (y < 0)
                     return 0.0;
                 else
                     return INFINITY;
             }
         }
 
-        if( y < 0 )
-            return 0;
+        if (y < 0) return 0;
         return INFINITY;
     }
 
-    //y = +-inf cases
-    if( isinf(fabsy) )
+    // y = +-inf cases
+    if (isinf(fabsy))
     {
-        if( x == -1 )
-            return 1;
+        if (x == -1) return 1;
 
-        if( y < 0 )
+        if (y < 0)
         {
-            if( fabsx < 1 )
-                return INFINITY;
+            if (fabsx < 1) return INFINITY;
             return 0;
         }
-        if( fabsx < 1 )
-            return 0;
+        if (fabsx < 1) return 0;
         return INFINITY;
     }
 
     // x < 0 and y non integer case
-    if( x < 0 && iy != fabsy )
+    if (x < 0 && iy != fabsy)
     {
-        //return nan;
+        // return nan;
         return cl_make_nan();
     }
 
-    //speedy resolution of sqrt and reciprocal sqrt
-    if( fabsy == 0.5 )
+    // speedy resolution of sqrt and reciprocal sqrt
+    if (fabsy == 0.5)
     {
-        long double xl = sqrtl( x );
-        if( y < 0 )
-            xl = 1.0/ xl;
+        long double xl = sqrtl(x);
+        if (y < 0) xl = 1.0 / xl;
         return xl;
     }
 
     double log2x_hi, log2x_lo;
 
-    // extended precision log .... accurate to at least 64-bits + couple of guard bits
+    // extended precision log .... accurate to at least 64-bits + couple of
+    // guard bits
     __log2_ep(&log2x_hi, &log2x_lo, fabsx);
 
     double ylog2x_hi, ylog2x_lo;
 
-    double y_hi = (double) y;
-    double y_lo = (double) ( y - (long double) y_hi);
+    double y_hi = (double)y;
+    double y_lo = (double)(y - (long double)y_hi);
 
     // compute product of y*log2(x)
     // scale to avoid overflow in double-double multiplication
-    if( reference_fabs( y ) > HEX_DBL( +, 1, 0, +, 970 ) ) {
+    if (reference_fabs(y) > HEX_DBL(+, 1, 0, +, 970))
+    {
         y_hi = reference_ldexp(y_hi, -53);
         y_lo = reference_ldexp(y_lo, -53);
     }
     MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo);
-    if( fabs( y ) > HEX_DBL( +, 1, 0, +, 970 ) ) {
+    if (fabs(y) > HEX_DBL(+, 1, 0, +, 970))
+    {
         ylog2x_hi = reference_ldexp(ylog2x_hi, 53);
         ylog2x_lo = reference_ldexp(ylog2x_lo, 53);
     }
 
     long double powxy;
-    if(isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200)) {
-        powxy = reference_signbit(ylog2x_hi) ? HEX_DBL( +, 0, 0, +, 0 ) : INFINITY;
-    } else {
+    if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200))
+    {
+        powxy =
+            reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY;
+    }
+    else
+    {
         // separate integer + fractional part
         long int m = lrint(ylog2x_hi);
         AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0);
 
         // revert to long double arithemtic
-        long double ylog2x = (long double) ylog2x_hi + (long double) ylog2x_lo;
-        long double tmp = reference_exp2l( ylog2x );
+        long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo;
+        long double tmp = reference_exp2l(ylog2x);
         powxy = reference_scalblnl(tmp, m);
     }
 
     // if y is odd integer and x is negative, reverse sign
-    if( isOddInt & reference_signbit(x))
-        powxy = -powxy;
+    if (isOddInt & reference_signbit(x)) powxy = -powxy;
     return powxy;
 }
 
 double reference_nextafter(double xx, double yy)
 {
-    float x = (float) xx;
-    float y = (float) yy;
+    float x = (float)xx;
+    float y = (float)yy;
 
     // take care of nans
-    if( x != x )
-        return x;
+    if (x != x) return x;
 
-    if( y != y )
-        return y;
+    if (y != y) return y;
 
-    if( x == y )
-        return y;
+    if (x == y) return y;
 
     int32f_t a, b;
 
-    a.f  = x;
-    b.f  = y;
+    a.f = x;
+    b.f = y;
 
-    if( a.i & 0x80000000 )
-        a.i = 0x80000000 - a.i;
-    if(b.i & 0x80000000 )
-        b.i = 0x80000000 - b.i;
+    if (a.i & 0x80000000) a.i = 0x80000000 - a.i;
+    if (b.i & 0x80000000) b.i = 0x80000000 - b.i;
 
     a.i += (a.i < b.i) ? 1 : -1;
-    a.i = (a.i < 0) ? (cl_int) 0x80000000 - a.i : a.i;
+    a.i = (a.i < 0) ? (cl_int)0x80000000 - a.i : a.i;
 
     return a.f;
 }
@@ -4431,33 +4994,28 @@ double reference_nextafter(double xx, double yy)
 
 long double reference_nextafterl(long double xx, long double yy)
 {
-    double x = (double) xx;
-    double y = (double) yy;
+    double x = (double)xx;
+    double y = (double)yy;
 
     // take care of nans
-    if( x != x )
-        return x;
+    if (x != x) return x;
 
-    if( y != y )
-        return y;
+    if (y != y) return y;
 
     int64d_t a, b;
 
-    a.d  = x;
-    b.d  = y;
+    a.d = x;
+    b.d = y;
 
     int64_t tmp = 0x8000000000000000LL;
 
-    if( a.l & tmp )
-        a.l = tmp - a.l;
-    if(b.l & tmp )
-        b.l = tmp - b.l;
+    if (a.l & tmp) a.l = tmp - a.l;
+    if (b.l & tmp) b.l = tmp - b.l;
 
-    // edge case. if (x == y) or (x = 0.0f and y = -0.0f) or (x = -0.0f and y = 0.0f)
-    // test needs to be done using integer rep because
-    // subnormals may be flushed to zero on some platforms
-    if( a.l == b.l )
-        return y;
+    // edge case: (x == y) or (x = 0.0 and y = -0.0) or (x = -0.0 and y = 0.0).
+    // The test needs to be done using the integer rep because subnormals may
+    // be flushed to zero on some platforms
+    if (a.l == b.l) return y;
 
     a.l += (a.l < b.l) ? 1 : -1;
     a.l = (a.l < 0) ? tmp - a.l : a.l;
@@ -4467,112 +5025,110 @@ long double reference_nextafterl(long double xx, long double yy)
 
 double reference_fdim(double xx, double yy)
 {
-    float x = (float) xx;
-    float y = (float) yy;
+    float x = (float)xx;
+    float y = (float)yy;
 
-    if( x != x )
-        return x;
+    if (x != x) return x;
 
-    if( y != y )
-        return y;
+    if (y != y) return y;
 
-    float r = ( x > y ) ? (float) reference_subtract( x, y) : 0.0f;
+    float r = (x > y) ? (float)reference_subtract(x, y) : 0.0f;
     return r;
-
 }
 
 
 long double reference_fdiml(long double xx, long double yy)
 {
-    double x = (double) xx;
-    double y = (double) yy;
+    double x = (double)xx;
+    double y = (double)yy;
 
-    if( x != x )
-        return x;
+    if (x != x) return x;
 
-    if( y != y )
-        return y;
+    if (y != y) return y;
 
-    double r = ( x > y ) ? (double) reference_subtractl(x, y) : 0.0;
+    double r = (x > y) ? (double)reference_subtractl(x, y) : 0.0;
     return r;
 }
 
 double reference_remquo(double xd, double yd, int *n)
 {
-    float xx = (float) xd;
-    float yy = (float) yd;
+    float xx = (float)xd;
+    float yy = (float)yd;
 
-    if( isnan(xx) || isnan(yy) ||
-        fabsf(xx) == INFINITY  ||
-        yy == 0.0 )
+    if (isnan(xx) || isnan(yy) || fabsf(xx) == INFINITY || yy == 0.0)
     {
         *n = 0;
         return cl_make_nan();
     }
 
-    if( fabsf(yy) == INFINITY || xx == 0.0f ) {
+    if (fabsf(yy) == INFINITY || xx == 0.0f)
+    {
         *n = 0;
         return xd;
     }
 
-    if( fabsf(xx) == fabsf(yy) ) {
+    if (fabsf(xx) == fabsf(yy))
+    {
         *n = (xx == yy) ? 1 : -1;
-        return reference_signbit( xx ) ? -0.0 : 0.0;
+        return reference_signbit(xx) ? -0.0 : 0.0;
     }
 
-    int signx = reference_signbit( xx ) ? -1 : 1;
-    int signy = reference_signbit( yy ) ? -1 : 1;
+    int signx = reference_signbit(xx) ? -1 : 1;
+    int signy = reference_signbit(yy) ? -1 : 1;
     int signn = (signx == signy) ? 1 : -1;
     float x = fabsf(xx);
     float y = fabsf(yy);
 
     int ex, ey;
-    ex = reference_ilogb( x );
-    ey = reference_ilogb( y );
+    ex = reference_ilogb(x);
+    ey = reference_ilogb(y);
     float xr = x;
     float yr = y;
     uint32_t q = 0;
 
-    if(ex-ey >= -1) {
+    if (ex - ey >= -1)
+    {
 
-        yr = (float) reference_ldexp( y, -ey );
-        xr = (float) reference_ldexp( x, -ex );
+        yr = (float)reference_ldexp(y, -ey);
+        xr = (float)reference_ldexp(x, -ex);
 
-        if(ex-ey >= 0) {
+        if (ex - ey >= 0)
+        {
 
 
             int i;
-            for(i = ex-ey; i > 0; i--) {
+            for (i = ex - ey; i > 0; i--)
+            {
                 q <<= 1;
-                if(xr >= yr) {
+                if (xr >= yr)
+                {
                     xr -= yr;
                     q += 1;
                 }
                 xr += xr;
             }
             q <<= 1;
-            if( xr > yr ) {
+            if (xr > yr)
+            {
                 xr -= yr;
                 q += 1;
             }
         }
-        else //ex-ey = -1
-            xr = reference_ldexp(xr, ex-ey);
+        else // ex-ey = -1
+            xr = reference_ldexp(xr, ex - ey);
     }
 
-    if( (yr < 2.0f*xr) || ( (yr == 2.0f*xr) && (q & 0x00000001) ) ) {
+    if ((yr < 2.0f * xr) || ((yr == 2.0f * xr) && (q & 0x00000001)))
+    {
         xr -= yr;
         q += 1;
     }
 
-    if(ex-ey >= -1)
-        xr = reference_ldexp(xr, ey);
+    if (ex - ey >= -1) xr = reference_ldexp(xr, ey);
 
     int qout = q & 0x0000007f;
-    if( signn < 0)
-        qout = -qout;
-    if( xx < 0.0 )
-        xr = -xr;
+    if (signn < 0) qout = -qout;
+    if (xx < 0.0) xr = -xr;
 
     *n = qout;
 
@@ -4582,79 +5138,82 @@ double reference_remquo(double xd, double yd, int *n)
 long double reference_remquol(long double xd, long double yd, int *n)
 {
 
-    double xx = (double) xd;
-    double yy = (double) yd;
+    double xx = (double)xd;
+    double yy = (double)yd;
 
-    if( isnan(xx) || isnan(yy) ||
-        fabs(xx) == INFINITY  ||
-        yy == 0.0 )
+    if (isnan(xx) || isnan(yy) || fabs(xx) == INFINITY || yy == 0.0)
     {
         *n = 0;
         return cl_make_nan();
     }
 
-    if( reference_fabs(yy) == INFINITY || xx == 0.0 ) {
+    if (reference_fabs(yy) == INFINITY || xx == 0.0)
+    {
         *n = 0;
         return xd;
     }
 
-    if( reference_fabs(xx) == reference_fabs(yy) ) {
+    if (reference_fabs(xx) == reference_fabs(yy))
+    {
         *n = (xx == yy) ? 1 : -1;
-        return reference_signbit( xx ) ? -0.0 : 0.0;
+        return reference_signbit(xx) ? -0.0 : 0.0;
     }
 
-    int signx = reference_signbit( xx ) ? -1 : 1;
-    int signy = reference_signbit( yy ) ? -1 : 1;
+    int signx = reference_signbit(xx) ? -1 : 1;
+    int signy = reference_signbit(yy) ? -1 : 1;
     int signn = (signx == signy) ? 1 : -1;
     double x = reference_fabs(xx);
     double y = reference_fabs(yy);
 
     int ex, ey;
-    ex = reference_ilogbl( x );
-    ey = reference_ilogbl( y );
+    ex = reference_ilogbl(x);
+    ey = reference_ilogbl(y);
     double xr = x;
     double yr = y;
     uint32_t q = 0;
 
-    if(ex-ey >= -1) {
+    if (ex - ey >= -1)
+    {
 
-        yr = reference_ldexp( y, -ey );
-        xr = reference_ldexp( x, -ex );
+        yr = reference_ldexp(y, -ey);
+        xr = reference_ldexp(x, -ex);
         int i;
 
-        if(ex-ey >= 0) {
+        if (ex - ey >= 0)
+        {
 
-            for(i = ex-ey; i > 0; i--) {
+            for (i = ex - ey; i > 0; i--)
+            {
                 q <<= 1;
-                if(xr >= yr) {
+                if (xr >= yr)
+                {
                     xr -= yr;
                     q += 1;
                 }
                 xr += xr;
             }
             q <<= 1;
-            if( xr > yr ) {
+            if (xr > yr)
+            {
                 xr -= yr;
                 q += 1;
             }
         }
         else
-            xr = reference_ldexp(xr, ex-ey);
+            xr = reference_ldexp(xr, ex - ey);
     }
 
-    if( (yr < 2.0*xr) || ( (yr == 2.0*xr) && (q & 0x00000001) ) ) {
+    if ((yr < 2.0 * xr) || ((yr == 2.0 * xr) && (q & 0x00000001)))
+    {
         xr -= yr;
         q += 1;
     }
 
-    if(ex-ey >= -1)
-        xr = reference_ldexp(xr, ey);
+    if (ex - ey >= -1) xr = reference_ldexp(xr, ey);
 
     int qout = q & 0x0000007f;
-    if( signn < 0)
-        qout = -qout;
-    if( xx < 0.0 )
-        xr = -xr;
+    if (signn < 0) qout = -qout;
+    if (xx < 0.0) xr = -xr;
 
     *n = qout;
     return xr;
@@ -4662,27 +5221,27 @@ long double reference_remquol(long double xd, long double yd, int *n)
 
 static double reference_scalbn(double x, int n)
 {
-    if(reference_isinf(x) || reference_isnan(x) || x == 0.0)
-        return x;
+    if (reference_isinf(x) || reference_isnan(x) || x == 0.0) return x;
 
     int bias = 1023;
-    union { double d; cl_long l; } u;
-    u.d = (double) x;
+    union {
+        double d;
+        cl_long l;
+    } u;
+    u.d = (double)x;
     int e = (int)((u.l & 0x7ff0000000000000LL) >> 52);
-    if(e == 0)
+    if (e == 0)
     {
         u.l |= ((cl_long)1023 << 52);
         u.d -= 1.0;
         e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022;
     }
     e += n;
-    if(e >= 2047 || n >= 2098 )
-        return reference_copysign(INFINITY, x);
-    if(e < -51 || n <-2097 )
-        return reference_copysign(0.0, x);
-    if(e <= 0)
+    if (e >= 2047 || n >= 2098) return reference_copysign(INFINITY, x);
+    if (e < -51 || n < -2097) return reference_copysign(0.0, x);
+    if (e <= 0)
     {
-        bias += (e-1);
+        bias += (e - 1);
         e = 1;
     }
     u.l &= 0x800fffffffffffffLL;
@@ -4695,26 +5254,26 @@ static double reference_scalbn(double x, int n)
 static long double reference_scalblnl(long double x, long n)
 {
 #if defined(__i386__) || defined(__x86_64__) // INTEL
-    union
-    {
+    union {
         long double d;
-        struct{ cl_ulong m; cl_ushort sexp;}u;
-    }u;
+        struct
+        {
+            cl_ulong m;
+            cl_ushort sexp;
+        } u;
+    } u;
     u.u.m = CL_LONG_MIN;
 
-    if ( reference_isinf(x) )
-        return x;
+    if (reference_isinf(x)) return x;
 
-    if( x == 0.0L || n < -2200)
-        return reference_copysignl( 0.0L, x );
+    if (x == 0.0L || n < -2200) return reference_copysignl(0.0L, x);
 
-    if( n > 2200 )
-        return reference_copysignl( INFINITY, x );
+    if (n > 2200) return reference_copysignl(INFINITY, x);
 
-    if( n < 0 )
+    if (n < 0)
     {
         u.u.sexp = 0x3fff - 1022;
-        while( n <= -1022 )
+        while (n <= -1022)
         {
             x *= u.d;
             n += 1022;
@@ -4724,10 +5283,10 @@ static long double reference_scalblnl(long double x, long n)
         return x;
     }
 
-    if( n > 0 )
+    if (n > 0)
     {
         u.u.sexp = 0x3fff + 1023;
-        while( n >= 1023 )
+        while (n >= 1023)
         {
             x *= u.d;
             n -= 1023;
@@ -4742,27 +5301,27 @@ static long double reference_scalblnl(long double x, long n)
 #elif defined(__arm__) // ARM .. sizeof(long double) == sizeof(double)
 
 #if __DBL_MAX_EXP__ >= __LDBL_MAX_EXP__
-    if(reference_isinfl(x) || reference_isnanl(x))
-        return x;
+    if (reference_isinfl(x) || reference_isnanl(x)) return x;
 
     int bias = 1023;
-    union { double d; cl_long l; } u;
-    u.d = (double) x;
+    union {
+        double d;
+        cl_long l;
+    } u;
+    u.d = (double)x;
     int e = (int)((u.l & 0x7ff0000000000000LL) >> 52);
-    if(e == 0)
+    if (e == 0)
     {
         u.l |= ((cl_long)1023 << 52);
         u.d -= 1.0;
         e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022;
     }
     e += n;
-    if(e >= 2047)
-        return reference_copysignl(INFINITY, x);
-    if(e < -51)
-        return reference_copysignl(0.0, x);
-    if(e <= 0)
+    if (e >= 2047) return reference_copysignl(INFINITY, x);
+    if (e < -51) return reference_copysignl(0.0, x);
+    if (e <= 0)
     {
-        bias += (e-1);
+        bias += (e - 1);
         e = 1;
     }
     u.l &= 0x800fffffffffffffLL;
@@ -4772,284 +5331,259 @@ static long double reference_scalblnl(long double x, long n)
     return x * u.d;
 #endif
 
-#else  // PPC
+#else // PPC
     return scalblnl(x, n);
 #endif
 }
 
-double reference_relaxed_exp( double x )
-{
-  return reference_exp(x);
-}
+double reference_relaxed_exp(double x) { return reference_exp(x); }
 
 double reference_exp(double x)
 {
-  return reference_exp2( x * HEX_DBL( +, 1, 71547652b82fe, +, 0 ) );
+    return reference_exp2(x * HEX_DBL(+, 1, 71547652b82fe, +, 0));
 }
 
 long double reference_expl(long double x)
 {
 #if defined(__PPC__)
-  long double scale, bias;
-
-  // The PPC double long version of expl fails to produce denorm results
-  // and instead generates a 0.0. Compensate for this limitation by
-  // computing expl as:
-  //     expl(x + 40) * expl(-40)
-  // Likewise, overflows can prematurely produce an infinity, so we
-  // compute expl as:
-  //     expl(x - 40) * expl(40)
-  scale = 1.0L;
-  bias = 0.0L;
-  if (x < -708.0L) {
-    bias = 40.0;
-    scale = expl(-40.0L);
-  } else if (x > 708.0L) {
-    bias = -40.0L;
-    scale = expl(40.0L);
-  }
-  return expl(x + bias) * scale;
+    long double scale, bias;
+
+    // The PPC double long version of expl fails to produce denorm results
+    // and instead generates a 0.0. Compensate for this limitation by
+    // computing expl as:
+    //     expl(x + 40) * expl(-40)
+    // Likewise, overflows can prematurely produce an infinity, so we
+    // compute expl as:
+    //     expl(x - 40) * expl(40)
+    scale = 1.0L;
+    bias = 0.0L;
+    if (x < -708.0L)
+    {
+        bias = 40.0;
+        scale = expl(-40.0L);
+    }
+    else if (x > 708.0L)
+    {
+        bias = -40.0L;
+        scale = expl(40.0L);
+    }
+    return expl(x + bias) * scale;
 #else
-    return expl( x );
+    return expl(x);
 #endif
 }
 
-double reference_sinh(double x)
-{
-    return sinh(x);
-}
+double reference_sinh(double x) { return sinh(x); }
 
-long double reference_sinhl(long double x)
-{
-    return sinhl(x);
-}
+long double reference_sinhl(long double x) { return sinhl(x); }
 
 double reference_fmod(double x, double y)
 {
-    if( x == 0.0 && fabs(y) > 0.0 )
-        return x;
+    if (x == 0.0 && fabs(y) > 0.0) return x;
 
-    if( fabs(x) == INFINITY || y == 0 )
-        return cl_make_nan();
+    if (fabs(x) == INFINITY || y == 0) return cl_make_nan();
 
-    if( fabs(y) == INFINITY )    // we know x is finite from above
+    if (fabs(y) == INFINITY) // we know x is finite from above
         return x;
 #if defined(_MSC_VER) && defined(_M_X64)
-    return fmod( x, y );
+    return fmod(x, y);
 #else
-    return fmodf( (float) x, (float) y );
+    return fmodf((float)x, (float)y);
 #endif
 }
 
 long double reference_fmodl(long double x, long double y)
 {
-    if( x == 0.0L && fabsl(y) > 0.0L )
-        return x;
+    if (x == 0.0L && fabsl(y) > 0.0L) return x;
 
-    if( fabsl(x) == INFINITY || y == 0.0L )
-        return cl_make_nan();
+    if (fabsl(x) == INFINITY || y == 0.0L) return cl_make_nan();
 
-    if( fabsl(y) == INFINITY )    // we know x is finite from above
+    if (fabsl(y) == INFINITY) // we know x is finite from above
         return x;
 
-    return fmod( (double) x, (double) y );
+    return fmod((double)x, (double)y);
 }
 
 double reference_modf(double x, double *n)
 {
-    if(isnan(x)) {
+    if (isnan(x))
+    {
         *n = cl_make_nan();
         return cl_make_nan();
     }
     float nr;
-    float yr = modff((float) x, &nr);
+    float yr = modff((float)x, &nr);
     *n = nr;
     return yr;
 }
 
 long double reference_modfl(long double x, long double *n)
 {
-    if(isnan(x)) {
+    if (isnan(x))
+    {
         *n = cl_make_nan();
         return cl_make_nan();
     }
     double nr;
-    double yr = modf((double) x, &nr);
+    double yr = modf((double)x, &nr);
     *n = nr;
     return yr;
 }
 
-long double reference_fractl(long double x, long double *ip )
+long double reference_fractl(long double x, long double *ip)
 {
-    if(isnan(x)) {
+    if (isnan(x))
+    {
         *ip = cl_make_nan();
         return cl_make_nan();
     }
 
     double i;
-    double f = modf((double) x, &i );
-    if( f < 0.0 )
+    double f = modf((double)x, &i);
+    if (f < 0.0)
     {
         f = 1.0 + f;
         i -= 1.0;
-        if( f == 1.0 )
-            f = HEX_DBL( +, 1, fffffffffffff, -, 1 );
+        if (f == 1.0) f = HEX_DBL(+, 1, fffffffffffff, -, 1);
     }
     *ip = i;
     return f;
 }
 
-long double reference_fabsl(long double x)
-{
-    return fabsl( x );
-}
+long double reference_fabsl(long double x) { return fabsl(x); }
 
-double reference_relaxed_log( double x )
+double reference_relaxed_log(double x)
 {
-  return (float)reference_log((float)x);
+    return (float)reference_log((float)x);
 }
 
 double reference_log(double x)
 {
-    if( x == 0.0 )
-        return -INFINITY;
+    if (x == 0.0) return -INFINITY;
 
-    if( x < 0.0 )
-        return cl_make_nan();
+    if (x < 0.0) return cl_make_nan();
 
-    if( isinf(x) )
-        return INFINITY;
+    if (isinf(x)) return INFINITY;
 
-    double log2Hi = HEX_DBL( +, 1, 62e42fefa39ef, -, 1 );
+    double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
     double logxHi, logxLo;
     __log2_ep(&logxHi, &logxLo, x);
-    return logxHi*log2Hi;
+    return logxHi * log2Hi;
 }
 
 long double reference_logl(long double x)
 {
-    if( x == 0.0 )
-        return -INFINITY;
+    if (x == 0.0) return -INFINITY;
 
-    if( x < 0.0 )
-        return cl_make_nan();
+    if (x < 0.0) return cl_make_nan();
 
-    if( isinf(x) )
-        return INFINITY;
+    if (isinf(x)) return INFINITY;
 
-    double log2Hi = HEX_DBL( +, 1, 62e42fefa39ef, -, 1 );
-    double log2Lo = HEX_DBL( +, 1, abc9e3b39803f, -, 56 );
+    double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
+    double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56);
     double logxHi, logxLo;
     __log2_ep(&logxHi, &logxLo, x);
 
-    //double rhi, rlo;
-    //MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo);
-    //return (long double) rhi + (long double) rlo;
+    // double rhi, rlo;
+    // MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo);
+    // return (long double) rhi + (long double) rlo;
 
-    long double lg2 = (long double) log2Hi + (long double) log2Lo;
-    long double logx = (long double) logxHi + (long double) logxLo;
-    return logx*lg2;
+    long double lg2 = (long double)log2Hi + (long double)log2Lo;
+    long double logx = (long double)logxHi + (long double)logxLo;
+    return logx * lg2;
 }
 
-double reference_relaxed_pow( double x, double y) {
-  return (float)reference_exp2( ((float)y) * (float)reference_log2((float)x));
+double reference_relaxed_pow(double x, double y)
+{
+    return (float)reference_exp2(((float)y) * (float)reference_log2((float)x));
 }
 
-double reference_pow( double x, double y )
+double reference_pow(double x, double y)
 {
-    static const double neg_epsilon = HEX_DBL( +, 1, 0, +, 53 );
+    static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53);
 
-    //if x = 1, return x for any y, even NaN
-    if( x == 1.0 )
-        return x;
+    // if x = 1, return x for any y, even NaN
+    if (x == 1.0) return x;
 
-    //if y == 0, return 1 for any x, even NaN
-    if( y == 0.0 )
-        return 1.0;
+    // if y == 0, return 1 for any x, even NaN
+    if (y == 0.0) return 1.0;
 
-    //get NaNs out of the way
-    if( x != x  || y != y )
-        return x + y;
+    // get NaNs out of the way
+    if (x != x || y != y) return x + y;
 
-    //do the work required to sort out edge cases
-    double fabsy = reference_fabs( y );
-    double fabsx = reference_fabs( x );
-    double iy = reference_rint( fabsy );            //we do round to nearest here so that |fy| <= 0.5
-    if( iy > fabsy )//convert nearbyint to floor
+    // do the work required to sort out edge cases
+    double fabsy = reference_fabs(y);
+    double fabsx = reference_fabs(x);
+    double iy = reference_rint(
+        fabsy); // we do round to nearest here so that |fy| <= 0.5
+    if (iy > fabsy) // convert nearbyint to floor
         iy -= 1.0;
     int isOddInt = 0;
-    if( fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon )
-        isOddInt =     (int) (iy - 2.0 * rint( 0.5 * iy ));        //might be 0, -1, or 1
+    if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon)
+        isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1
 
-    ///test a few more edge cases
-    //deal with x == 0 cases
-    if( x == 0.0 )
+    /// test a few more edge cases
+    // deal with x == 0 cases
+    if (x == 0.0)
     {
-        if( ! isOddInt )
-            x = 0.0;
+        if (!isOddInt) x = 0.0;
 
-        if( y < 0 )
-            x = 1.0/ x;
+        if (y < 0) x = 1.0 / x;
 
         return x;
     }
 
-    //x == +-Inf cases
-    if( isinf(fabsx) )
+    // x == +-Inf cases
+    if (isinf(fabsx))
     {
-        if( x < 0 )
+        if (x < 0)
         {
-            if( isOddInt )
+            if (isOddInt)
             {
-                if( y < 0 )
+                if (y < 0)
                     return -0.0;
                 else
                     return -INFINITY;
             }
             else
             {
-                if( y < 0 )
+                if (y < 0)
                     return 0.0;
                 else
                     return INFINITY;
             }
         }
 
-        if( y < 0 )
-            return 0;
+        if (y < 0) return 0;
         return INFINITY;
     }
 
-    //y = +-inf cases
-    if( isinf(fabsy) )
+    // y = +-inf cases
+    if (isinf(fabsy))
     {
-        if( x == -1 )
-            return 1;
+        if (x == -1) return 1;
 
-        if( y < 0 )
+        if (y < 0)
         {
-            if( fabsx < 1 )
-                return INFINITY;
+            if (fabsx < 1) return INFINITY;
             return 0;
         }
-        if( fabsx < 1 )
-            return 0;
+        if (fabsx < 1) return 0;
         return INFINITY;
     }
 
     // x < 0 and y non integer case
-    if( x < 0 && iy != fabsy )
+    if (x < 0 && iy != fabsy)
     {
-        //return nan;
+        // return nan;
         return cl_make_nan();
     }
 
-    //speedy resolution of sqrt and reciprocal sqrt
-    if( fabsy == 0.5 )
+    // speedy resolution of sqrt and reciprocal sqrt
+    if (fabsy == 0.5)
     {
-        long double xl = reference_sqrt( x );
-        if( y < 0 )
-            xl = 1.0/ xl;
+        long double xl = reference_sqrt(x);
+        if (y < 0) xl = 1.0 / xl;
         return xl;
     }
 
@@ -5060,73 +5594,55 @@ double reference_pow( double x, double y )
     return isOddInt ? reference_copysignd(result, x) : result;
 }
 
-double reference_sqrt(double x)
-{
-    return sqrt(x);
-}
+double reference_sqrt(double x) { return sqrt(x); }
 
-double reference_floor(double x)
-{
-    return floorf((float) x);
-}
+double reference_floor(double x) { return floorf((float)x); }
 
 double reference_ldexp(double value, int exponent)
 {
 #ifdef __MINGW32__
-/*
- * ====================================================
- * This function is from fdlibm: http://www.netlib.org
- *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-    if(!finite(value)||value==0.0) return value;
-    return scalbn(value,exponent);
+    /*
+     * ====================================================
+     * This function is from fdlibm: http://www.netlib.org
+     *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+     *
+     * Developed at SunSoft, a Sun Microsystems, Inc. business.
+     * Permission to use, copy, modify, and distribute this
+     * software is freely granted, provided that this notice
+     * is preserved.
+     * ====================================================
+     */
+    if (!finite(value) || value == 0.0) return value;
+    return scalbn(value, exponent);
 #else
     return reference_scalbn(value, exponent);
 #endif
 }
 
-long double reference_ldexpl(long double x, int n)
-{
-    return ldexpl( x, n);
-}
+long double reference_ldexpl(long double x, int n) { return ldexpl(x, n); }
 
-long double reference_coshl(long double x)
-{
-    return coshl(x);
-}
+long double reference_coshl(long double x) { return coshl(x); }
 
-double reference_ceil(double x)
-{
-    return ceilf((float) x);
-}
+double reference_ceil(double x) { return ceilf((float)x); }
 
 long double reference_ceill(long double x)
 {
-    if( x == 0.0 || reference_isinfl(x) || reference_isnanl(x) )
-        return x;
+    if (x == 0.0 || reference_isinfl(x) || reference_isnanl(x)) return x;
 
     long double absx = reference_fabsl(x);
-    if( absx >= HEX_LDBL( +, 1, 0, +, 52 ) )
-        return x;
+    if (absx >= HEX_LDBL(+, 1, 0, +, 52)) return x;
 
-    if( absx < 1.0 )
+    if (absx < 1.0)
     {
-        if( x < 0.0 )
+        if (x < 0.0)
             return 0.0;
         else
             return 1.0;
     }
 
-    long double r = (long double) ((cl_long) x);
+    long double r = (long double)((cl_long)x);
 
-    if( x > 0.0 && r < x )
-        r += 1.0;
+    if (x > 0.0 && r < x) r += 1.0;
 
     return r;
 }
@@ -5137,45 +5653,53 @@ long double reference_acosl(long double x)
     long double x2 = x * x;
     int i;
 
-    //Prepare a head + tail representation of PI in long double.  A good compiler should get rid of all of this work.
-    static const cl_ulong pi_bits[2] = { 0x3243F6A8885A308DULL, 0x313198A2E0370734ULL};  // first 126 bits of pi http://www.super-computing.org/pi-hexa_current.html
+    // Prepare a head + tail representation of PI in long double.  A good
+    // compiler should get rid of all of this work.
+    static const cl_ulong pi_bits[2] = {
+        0x3243F6A8885A308DULL, 0x313198A2E0370734ULL
+    }; // first 126 bits of pi
+       // http://www.super-computing.org/pi-hexa_current.html
     long double head, tail, temp;
 #if __LDBL_MANT_DIG__ >= 64
     // long double has 64-bits of precision or greater
-    temp = (long double) pi_bits[0] * 0x1.0p64L;
-    head = temp + (long double) pi_bits[1];
-    temp -= head;           // rounding err rounding pi_bits[1] into head
-    tail = (long double) pi_bits[1] + temp;
-    head *= HEX_LDBL( +, 1, 0, -, 125 );
-    tail *= HEX_LDBL( +, 1, 0, -, 125 );
+    temp = (long double)pi_bits[0] * 0x1.0p64L;
+    head = temp + (long double)pi_bits[1];
+    temp -= head; // rounding err rounding pi_bits[1] into head
+    tail = (long double)pi_bits[1] + temp;
+    head *= HEX_LDBL(+, 1, 0, -, 125);
+    tail *= HEX_LDBL(+, 1, 0, -, 125);
 #else
-    head = (long double) pi_bits[0];
-    tail = (long double) ((cl_long) pi_bits[0] - (cl_long) head );       // residual part of pi_bits[0] after rounding
-    tail = tail * HEX_LDBL( +, 1, 0, +, 64 ) + (long double) pi_bits[1];
-    head *= HEX_LDBL( +, 1, 0, -, 61 );
-    tail *= HEX_LDBL( +, 1, 0, -, 125 );
+    head = (long double)pi_bits[0];
+    tail =
+        (long double)((cl_long)pi_bits[0]
+                      - (cl_long)
+                          head); // residual part of pi_bits[0] after rounding
+    tail = tail * HEX_LDBL(+, 1, 0, +, 64) + (long double)pi_bits[1];
+    head *= HEX_LDBL(+, 1, 0, -, 61);
+    tail *= HEX_LDBL(+, 1, 0, -, 125);
 #endif
 
     // oversize values and NaNs go to NaN
-    if( ! (x2 <= 1.0) )
-        return sqrtl(1.0L - x2 );
+    if (!(x2 <= 1.0)) return sqrtl(1.0L - x2);
 
     //
     // deal with large |x|:
     //                                                      sqrt( 1 - x**2)
-    // acos(|x| > sqrt(0.5)) = 2 * atan( z );       z = -------------------- ;      z in [0, sqrt(0.5)/(1+sqrt(0.5) = .4142135...]
+    // acos(|x| > sqrt(0.5)) = 2 * atan( z );       z = -------------------- ;
+    // z in [0, sqrt(0.5)/(1+sqrt(0.5) = .4142135...]
     //                                                          1 + x
-    if( x2 > 0.5 )
+    if (x2 > 0.5)
     {
         // we handle the x < 0 case as pi - acos(|x|)
 
-        long double sign = reference_copysignl( 1.0L, x );
-        long double fabsx = reference_fabsl( x );
-        head -= head * sign;        // x > 0 ? 0 : pi.hi
-        tail -= tail * sign;        // x > 0 ? 0 : pi.low
+        long double sign = reference_copysignl(1.0L, x);
+        long double fabsx = reference_fabsl(x);
+        head -= head * sign; // x > 0 ? 0 : pi.hi
+        tail -= tail * sign; // x > 0 ? 0 : pi.low
 
-        // z = sqrt( 1-x**2 ) / (1+x) = sqrt( (1-x)(1+x) / (1+x)**2 ) = sqrt( (1-x)/(1+x) )
-        long double z2 = (1.0L - fabsx) / (1.0L + fabsx);   // z**2
+        // z = sqrt( 1-x**2 ) / (1+x) = sqrt( (1-x)(1+x) / (1+x)**2 ) = sqrt(
+        // (1-x)/(1+x) )
+        long double z2 = (1.0L - fabsx) / (1.0L + fabsx); // z**2
         long double z = sign * sqrtl(z2);
 
         //                     atan(sqrt(q))
@@ -5185,29 +5709,41 @@ long double reference_acosl(long double x)
         // Define q = r*r, and solve for atan(r):
         //
         //  atan(r) = (p(r) + 1) * r = rp(r) + r
-        static long double atan_coeffs[] = { HEX_LDBL( -, b, 3f52e0c278293b3, -, 67 ), HEX_LDBL( -, a, aaaaaaaaaaa95b8, -, 5 ),
-                                             HEX_LDBL( +, c, ccccccccc992407, -,  6 ), HEX_LDBL( -, 9, 24924923024398,  -, 6 ),
-                                             HEX_LDBL( +, e, 38e38d6f92c98f3, -,  7 ), HEX_LDBL( -, b, a2e89bfb8393ec6, -, 7 ),
-                                             HEX_LDBL( +, 9, d89a9f574d412cb, -,  7 ), HEX_LDBL( -, 8, 88580517884c547, -, 7 ),
-                                             HEX_LDBL( +, f, 0ab6756abdad408, -,  8 ), HEX_LDBL( -, d, 56a5b07a2f15b49, -, 8 ),
-                                             HEX_LDBL( +, b, 72ab587e46d80b2, -,  8 ), HEX_LDBL( -, 8, 62ea24bb5b2e636, -, 8 ),
-                                             HEX_LDBL( +, e, d67c16582123937, -, 10 ) }; // minimax fit over [ 0x1.0p-52, 0.18]   Max error:  0x1.67ea5c184e5d9p-64
+        static long double atan_coeffs[] = {
+            HEX_LDBL(-, b, 3f52e0c278293b3, -, 67),
+            HEX_LDBL(-, a, aaaaaaaaaaa95b8, -, 5),
+            HEX_LDBL(+, c, ccccccccc992407, -, 6),
+            HEX_LDBL(-, 9, 24924923024398, -, 6),
+            HEX_LDBL(+, e, 38e38d6f92c98f3, -, 7),
+            HEX_LDBL(-, b, a2e89bfb8393ec6, -, 7),
+            HEX_LDBL(+, 9, d89a9f574d412cb, -, 7),
+            HEX_LDBL(-, 8, 88580517884c547, -, 7),
+            HEX_LDBL(+, f, 0ab6756abdad408, -, 8),
+            HEX_LDBL(-, d, 56a5b07a2f15b49, -, 8),
+            HEX_LDBL(+, b, 72ab587e46d80b2, -, 8),
+            HEX_LDBL(-, 8, 62ea24bb5b2e636, -, 8),
+            HEX_LDBL(+, e, d67c16582123937, -, 10)
+        }; // minimax fit over [ 0x1.0p-52, 0.18]   Max error:
+           // 0x1.67ea5c184e5d9p-64
 
         // Calculate y = p(r)
-        const size_t atan_coeff_count = sizeof( atan_coeffs ) / sizeof( atan_coeffs[0] );
-        long double y = atan_coeffs[ atan_coeff_count - 1];
-        for( i = (int)atan_coeff_count - 2; i >= 0; i-- )
+        const size_t atan_coeff_count =
+            sizeof(atan_coeffs) / sizeof(atan_coeffs[0]);
+        long double y = atan_coeffs[atan_coeff_count - 1];
+        for (i = (int)atan_coeff_count - 2; i >= 0; i--)
             y = atan_coeffs[i] + y * z2;
 
-        z *= 2.0L;   // fold in 2.0 for 2.0 * atan(z)
-        y *= z;      // rp(r)
+        z *= 2.0L; // fold in 2.0 for 2.0 * atan(z)
+        y *= z; // rp(r)
 
         return head + ((y + tail) + z);
     }
 
     // do |x| <= sqrt(0.5) here
-    //                                                     acos( sqrt(z) ) - PI/2
-    //  Piecewise minimax polynomial fits for p(z) = 1 + ------------------------;
+    //                                                     acos( sqrt(z) ) -
+    //                                                     PI/2
+    //  Piecewise minimax polynomial fits for p(z) = 1 +
+    //  ------------------------;
     //                                                            sqrt(z)
     //
     //  Define z = x*x, and solve for acos(x) over x in  x >= 0:
@@ -5215,52 +5751,88 @@ long double reference_acosl(long double x)
     //      acos( sqrt(z) ) = acos(x) = x*(p(z)-1) + PI/2 = xp(x**2) - x + PI/2
     //
     const long double coeffs[4][14] = {
-                                    { HEX_LDBL( -, a, fa7382e1f347974, -, 10 ), HEX_LDBL( -, b, 4d5a992de1ac4da, -,  6 ),
-                                      HEX_LDBL( -, a, c526184bd558c17, -,  7 ), HEX_LDBL( -, d, 9ed9b0346ec092a, -,  8 ),
-                                      HEX_LDBL( -, 9, dca410c1f04b1f,  -,  8 ), HEX_LDBL( -, f, 76e411ba9581ee5, -,  9 ),
-                                      HEX_LDBL( -, c, c71b00479541d8e, -,  9 ), HEX_LDBL( -, a, f527a3f9745c9de, -,  9 ),
-                                      HEX_LDBL( -, 9, a93060051f48d14, -,  9 ), HEX_LDBL( -, 8, b3d39ad70e06021, -,  9 ),
-                                      HEX_LDBL( -, f, f2ab95ab84f79c,  -, 10 ), HEX_LDBL( -, e, d1af5f5301ccfe4, -, 10 ),
-                                      HEX_LDBL( -, e, 1b53ba562f0f74a, -, 10 ), HEX_LDBL( -, d, 6a3851330e15526, -, 10 ) },  // x - 0.0625 in [ -0x1.fffffffffp-5, 0x1.0p-4 ]    Error: 0x1.97839bf07024p-76
-
-                                    { HEX_LDBL( -, 8, c2f1d638e4c1b48, -,  8 ), HEX_LDBL( -, c, d47ac903c311c2c, -,  6 ),
-                                      HEX_LDBL( -, d, e020b2dabd5606a, -,  7 ), HEX_LDBL( -, a, 086fafac220f16b, -,  7 ),
-                                      HEX_LDBL( -, 8, 55b5efaf6b86c3e, -,  7 ), HEX_LDBL( -, f, 05c9774fed2f571, -,  8 ),
-                                      HEX_LDBL( -, e, 484a93f7f0fc772, -,  8 ), HEX_LDBL( -, e, 1a32baef01626e4, -,  8 ),
-                                      HEX_LDBL( -, e, 528e525b5c9c73d, -,  8 ), HEX_LDBL( -, e, ddd5d27ad49b2c8, -,  8 ),
-                                      HEX_LDBL( -, f, b3259e7ae10c6f,  -,  8 ), HEX_LDBL( -, 8, 68998170d5b19b7, -,  7 ),
-                                      HEX_LDBL( -, 9, 4468907f007727,  -,  7 ), HEX_LDBL( -, a, 2ad5e4906a8e7b3, -,  7 ) },// x - 0.1875 in [ -0x1.0p-4, 0x1.0p-4 ]    Error: 0x1.647af70073457p-73
-
-                                    { HEX_LDBL( -, f, a76585ad399e7ac, -,  8 ), HEX_LDBL( -, e, d665b7dd504ca7c, -,  6 ),
-                                      HEX_LDBL( -, 9, 4c7c2402bd4bc33, -,  6 ), HEX_LDBL( -, f, ba76b69074ff71c, -,  7 ),
-                                      HEX_LDBL( -, f, 58117784bdb6d5f, -,  7 ), HEX_LDBL( -, 8, 22ddd8eef53227d, -,  6 ),
-                                      HEX_LDBL( -, 9, 1d1d3b57a63cdb4, -,  6 ), HEX_LDBL( -, a, 9c4bdc40cca848,  -,  6 ),
-                                      HEX_LDBL( -, c, b673b12794edb24, -,  6 ), HEX_LDBL( -, f, 9290a06e31575bf, -,  6 ),
-                                      HEX_LDBL( -, 9, b4929c16aeb3d1f, -,  5 ), HEX_LDBL( -, c, 461e725765a7581, -,  5 ),
-                                      HEX_LDBL( -, 8, 0a59654c98d9207, -,  4 ), HEX_LDBL( -, a, 6de6cbd96c80562, -,  4 ) }, // x - 0.3125 in [ -0x1.0p-4, 0x1.0p-4 ]   Error: 0x1.b0246c304ce1ap-70
-
-                                    { HEX_LDBL( -, b, dca8b0359f96342, -,  7 ), HEX_LDBL( -, 8, cd2522fcde9823,  -,  5 ),
-                                      HEX_LDBL( -, d, 2af9397b27ff74d, -,  6 ), HEX_LDBL( -, d, 723f2c2c2409811, -,  6 ),
-                                      HEX_LDBL( -, f, ea8f8481ecc3cd1, -,  6 ), HEX_LDBL( -, a, 43fd8a7a646b0b2, -,  5 ),
-                                      HEX_LDBL( -, e, 01b0bf63a4e8d76, -,  5 ), HEX_LDBL( -, 9, f0b7096a2a7b4d,  -,  4 ),
-                                      HEX_LDBL( -, e, 872e7c5a627ab4c, -,  4 ), HEX_LDBL( -, a, dbd760a1882da48, -,  3 ),
-                                      HEX_LDBL( -, 8, 424e4dea31dd273, -,  2 ), HEX_LDBL( -, c, c05d7730963e793, -,  2 ),
-                                      HEX_LDBL( -, a, 523d97197cd124a, -,  1 ), HEX_LDBL( -, 8, 307ba943978aaee, +,  0 ) } // x - 0.4375 in [ -0x1.0p-4, 0x1.0p-4 ]  Error: 0x1.9ecff73da69c9p-66
-                                 };
+        { HEX_LDBL(-, a, fa7382e1f347974, -, 10),
+          HEX_LDBL(-, b, 4d5a992de1ac4da, -, 6),
+          HEX_LDBL(-, a, c526184bd558c17, -, 7),
+          HEX_LDBL(-, d, 9ed9b0346ec092a, -, 8),
+          HEX_LDBL(-, 9, dca410c1f04b1f, -, 8),
+          HEX_LDBL(-, f, 76e411ba9581ee5, -, 9),
+          HEX_LDBL(-, c, c71b00479541d8e, -, 9),
+          HEX_LDBL(-, a, f527a3f9745c9de, -, 9),
+          HEX_LDBL(-, 9, a93060051f48d14, -, 9),
+          HEX_LDBL(-, 8, b3d39ad70e06021, -, 9),
+          HEX_LDBL(-, f, f2ab95ab84f79c, -, 10),
+          HEX_LDBL(-, e, d1af5f5301ccfe4, -, 10),
+          HEX_LDBL(-, e, 1b53ba562f0f74a, -, 10),
+          HEX_LDBL(-, d, 6a3851330e15526, -,
+                   10) }, // x - 0.0625 in [ -0x1.fffffffffp-5, 0x1.0p-4 ]
+                          // Error: 0x1.97839bf07024p-76
+
+        { HEX_LDBL(-, 8, c2f1d638e4c1b48, -, 8),
+          HEX_LDBL(-, c, d47ac903c311c2c, -, 6),
+          HEX_LDBL(-, d, e020b2dabd5606a, -, 7),
+          HEX_LDBL(-, a, 086fafac220f16b, -, 7),
+          HEX_LDBL(-, 8, 55b5efaf6b86c3e, -, 7),
+          HEX_LDBL(-, f, 05c9774fed2f571, -, 8),
+          HEX_LDBL(-, e, 484a93f7f0fc772, -, 8),
+          HEX_LDBL(-, e, 1a32baef01626e4, -, 8),
+          HEX_LDBL(-, e, 528e525b5c9c73d, -, 8),
+          HEX_LDBL(-, e, ddd5d27ad49b2c8, -, 8),
+          HEX_LDBL(-, f, b3259e7ae10c6f, -, 8),
+          HEX_LDBL(-, 8, 68998170d5b19b7, -, 7),
+          HEX_LDBL(-, 9, 4468907f007727, -, 7),
+          HEX_LDBL(-, a, 2ad5e4906a8e7b3, -,
+                   7) }, // x - 0.1875 in [ -0x1.0p-4, 0x1.0p-4 ]    Error:
+                         // 0x1.647af70073457p-73
+
+        { HEX_LDBL(-, f, a76585ad399e7ac, -, 8),
+          HEX_LDBL(-, e, d665b7dd504ca7c, -, 6),
+          HEX_LDBL(-, 9, 4c7c2402bd4bc33, -, 6),
+          HEX_LDBL(-, f, ba76b69074ff71c, -, 7),
+          HEX_LDBL(-, f, 58117784bdb6d5f, -, 7),
+          HEX_LDBL(-, 8, 22ddd8eef53227d, -, 6),
+          HEX_LDBL(-, 9, 1d1d3b57a63cdb4, -, 6),
+          HEX_LDBL(-, a, 9c4bdc40cca848, -, 6),
+          HEX_LDBL(-, c, b673b12794edb24, -, 6),
+          HEX_LDBL(-, f, 9290a06e31575bf, -, 6),
+          HEX_LDBL(-, 9, b4929c16aeb3d1f, -, 5),
+          HEX_LDBL(-, c, 461e725765a7581, -, 5),
+          HEX_LDBL(-, 8, 0a59654c98d9207, -, 4),
+          HEX_LDBL(-, a, 6de6cbd96c80562, -,
+                   4) }, // x - 0.3125 in [ -0x1.0p-4, 0x1.0p-4 ]   Error:
+                         // 0x1.b0246c304ce1ap-70
+
+        { HEX_LDBL(-, b, dca8b0359f96342, -, 7),
+          HEX_LDBL(-, 8, cd2522fcde9823, -, 5),
+          HEX_LDBL(-, d, 2af9397b27ff74d, -, 6),
+          HEX_LDBL(-, d, 723f2c2c2409811, -, 6),
+          HEX_LDBL(-, f, ea8f8481ecc3cd1, -, 6),
+          HEX_LDBL(-, a, 43fd8a7a646b0b2, -, 5),
+          HEX_LDBL(-, e, 01b0bf63a4e8d76, -, 5),
+          HEX_LDBL(-, 9, f0b7096a2a7b4d, -, 4),
+          HEX_LDBL(-, e, 872e7c5a627ab4c, -, 4),
+          HEX_LDBL(-, a, dbd760a1882da48, -, 3),
+          HEX_LDBL(-, 8, 424e4dea31dd273, -, 2),
+          HEX_LDBL(-, c, c05d7730963e793, -, 2),
+          HEX_LDBL(-, a, 523d97197cd124a, -, 1),
+          HEX_LDBL(-, 8, 307ba943978aaee, +,
+                   0) } // x - 0.4375 in [ -0x1.0p-4, 0x1.0p-4 ]  Error:
+                        // 0x1.9ecff73da69c9p-66
+    };
 
     const long double offsets[4] = { 0.0625, 0.1875, 0.3125, 0.4375 };
-    const size_t coeff_count = sizeof( coeffs[0] ) / sizeof( coeffs[0][0] );
+    const size_t coeff_count = sizeof(coeffs[0]) / sizeof(coeffs[0][0]);
 
-    // reduce the incoming values a bit so that they are in the range [-0x1.0p-4, 0x1.0p-4]
+    // reduce the incoming values a bit so that they are in the range
+    // [-0x1.0p-4, 0x1.0p-4]
     const long double *c;
     i = x2 * 8.0L;
     c = coeffs[i];
-    x2 -= offsets[i];       // exact
+    x2 -= offsets[i]; // exact
 
     // calcualte p(x2)
-    long double y = c[ coeff_count - 1];
-    for( i = (int)coeff_count - 2; i >= 0; i-- )
-        y = c[i] + y * x2;
+    long double y = c[coeff_count - 1];
+    for (i = (int)coeff_count - 2; i >= 0; i--) y = c[i] + y * x2;
 
     // xp(x2)
     y *= x;
@@ -5273,58 +5845,50 @@ double reference_relaxed_acos(double x) { return reference_acos(x); }
 
 double reference_log10(double x)
 {
-    if( x == 0.0 )
-        return -INFINITY;
+    if (x == 0.0) return -INFINITY;
 
-    if( x < 0.0 )
-        return cl_make_nan();
+    if (x < 0.0) return cl_make_nan();
 
-    if( isinf(x) )
-        return INFINITY;
+    if (isinf(x)) return INFINITY;
 
-    double log2Hi = HEX_DBL( +, 1, 34413509f79fe, -, 2 );
+    double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2);
     double logxHi, logxLo;
     __log2_ep(&logxHi, &logxLo, x);
-    return logxHi*log2Hi;
+    return logxHi * log2Hi;
 }
 
 double reference_relaxed_log10(double x) { return reference_log10(x); }
 
 long double reference_log10l(long double x)
 {
-    if( x == 0.0 )
-        return -INFINITY;
+    if (x == 0.0) return -INFINITY;
 
-    if( x < 0.0 )
-        return cl_make_nan();
+    if (x < 0.0) return cl_make_nan();
 
-    if( isinf(x) )
-        return INFINITY;
+    if (isinf(x)) return INFINITY;
 
-    double log2Hi = HEX_DBL( +, 1, 34413509f79fe, -, 2 );
-    double log2Lo = HEX_DBL( +, 1, e623e2566b02d, -, 55 );
+    double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2);
+    double log2Lo = HEX_DBL(+, 1, e623e2566b02d, -, 55);
     double logxHi, logxLo;
     __log2_ep(&logxHi, &logxLo, x);
 
-    //double rhi, rlo;
-    //MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo);
-    //return (long double) rhi + (long double) rlo;
+    // double rhi, rlo;
+    // MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo);
+    // return (long double) rhi + (long double) rlo;
 
-    long double lg2 = (long double) log2Hi + (long double) log2Lo;
-    long double logx = (long double) logxHi + (long double) logxLo;
-    return logx*lg2;
+    long double lg2 = (long double)log2Hi + (long double)log2Lo;
+    long double logx = (long double)logxHi + (long double)logxLo;
+    return logx * lg2;
 }
 
-double reference_acos(double x)
-{
-    return acos( x );
-}
+double reference_acos(double x) { return acos(x); }
 
 double reference_atan2(double x, double y)
 {
 #if defined(_WIN32)
     // fix edge cases for Windows
-    if (isinf(x) && isinf(y)) {
+    if (isinf(x) && isinf(y))
+    {
         double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4;
         return (x > 0) ? retval : -retval;
     }
@@ -5336,7 +5900,8 @@ long double reference_atan2l(long double x, long double y)
 {
 #if defined(_WIN32)
     // fix edge cases for Windows
-    if (isinf(x) && isinf(y)) {
+    if (isinf(x) && isinf(y))
+    {
         long double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4;
         return (x > 0) ? retval : -retval;
     }
@@ -5346,7 +5911,7 @@ long double reference_atan2l(long double x, long double y)
 
 double reference_frexp(double a, int *exp)
 {
-    if(isnan(a) || isinf(a) || a == 0.0)
+    if (isnan(a) || isinf(a) || a == 0.0)
     {
         *exp = 0;
         return a;
@@ -5364,7 +5929,7 @@ double reference_frexp(double a, int *exp)
     u.l &= 0x7fffffffffffffffULL;
     int bias = -1022;
 
-    if((u.l & 0x7ff0000000000000ULL) == 0)
+    if ((u.l & 0x7ff0000000000000ULL) == 0)
     {
         double d = u.l;
         u.d = d;
@@ -5383,13 +5948,13 @@ double reference_frexp(double a, int *exp)
 
 long double reference_frexpl(long double a, int *exp)
 {
-    if(isnan(a) || isinf(a) || a == 0.0)
+    if (isnan(a) || isinf(a) || a == 0.0)
     {
         *exp = 0;
         return a;
     }
 
-    if(sizeof(long double) == sizeof(double))
+    if (sizeof(long double) == sizeof(double))
     {
         return reference_frexp(a, exp);
     }
@@ -5400,92 +5965,64 @@ long double reference_frexpl(long double a, int *exp)
 }
 
 
-double reference_atan(double x)
-{
-    return atan( x );
-}
+double reference_atan(double x) { return atan(x); }
 
-long double reference_atanl(long double x)
-{
-    return atanl( x );
-}
+long double reference_atanl(long double x) { return atanl(x); }
 
-long double reference_asinl(long double x)
-{
-    return asinl( x );
-}
+long double reference_asinl(long double x) { return asinl(x); }
 
-double reference_asin(double x)
-{
-    return asin( x );
-}
+double reference_asin(double x) { return asin(x); }
 
 double reference_relaxed_asin(double x) { return reference_asin(x); }
 
-double reference_fabs(double x)
-{
-    return fabs( x);
-}
+double reference_fabs(double x) { return fabs(x); }
 
-double reference_cosh(double x)
-{
-    return cosh( x );
-}
+double reference_cosh(double x) { return cosh(x); }
 
 long double reference_sqrtl(long double x)
 {
-#if defined( __SSE2__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
-    __m128d result128 = _mm_set_sd((double) x);
+#if defined(__SSE2__)                                                          \
+    || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
+    __m128d result128 = _mm_set_sd((double)x);
     result128 = _mm_sqrt_sd(result128, result128);
     return _mm_cvtsd_f64(result128);
 #else
     volatile double dx = x;
-    return sqrt( dx );
+    return sqrt(dx);
 #endif
 }
 
-long double reference_tanhl(long double x)
-{
-    return tanhl( x );
-}
+long double reference_tanhl(long double x) { return tanhl(x); }
 
 long double reference_floorl(long double x)
 {
-    if( x == 0.0 || reference_isinfl(x) || reference_isnanl(x) )
-        return x;
+    if (x == 0.0 || reference_isinfl(x) || reference_isnanl(x)) return x;
 
     long double absx = reference_fabsl(x);
-    if( absx >= HEX_LDBL( +, 1, 0, +, 52 ) )
-        return x;
+    if (absx >= HEX_LDBL(+, 1, 0, +, 52)) return x;
 
-    if( absx < 1.0 )
+    if (absx < 1.0)
     {
-        if( x < 0.0 )
+        if (x < 0.0)
             return -1.0;
         else
             return 0.0;
     }
 
-    long double r = (long double) ((cl_long) x);
+    long double r = (long double)((cl_long)x);
 
-    if( x < 0.0 && r > x )
-        r -= 1.0;
+    if (x < 0.0 && r > x) r -= 1.0;
 
     return r;
 }
 
 
-double reference_tanh(double x)
-{
-    return tanh( x );
-}
+double reference_tanh(double x) { return tanh(x); }
 
-long double reference_assignmentl( long double x ){ return x; }
+long double reference_assignmentl(long double x) { return x; }
 
-int reference_notl( long double x )
+int reference_notl(long double x)
 {
     int r = !x;
     return r;
 }
-
-
diff --git a/test_conformance/math_brute_force/reference_math.h b/test_conformance/math_brute_force/reference_math.h
index 7c751f68c5..78b245105e 100644
--- a/test_conformance/math_brute_force/reference_math.h
+++ b/test_conformance/math_brute_force/reference_math.h
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -16,223 +16,221 @@
 #ifndef REFERENCE_MATH_H
 #define REFERENCE_MATH_H
 
-#if defined( __APPLE__ )
-    #include <OpenCL/opencl.h>
+#if defined(__APPLE__)
+#include <OpenCL/opencl.h>
 #else
-    #include <CL/cl.h>
+#include <CL/cl.h>
 #endif
 
 // --  for testing float --
-double reference_sinh( double x );
-double reference_sqrt( double x );
-double reference_tanh( double x );
-double reference_acos( double );
-double reference_asin( double );
-double reference_atan( double );
-double reference_atan2( double, double );
-double reference_ceil( double );
-double reference_cosh( double );
-double reference_exp( double );
-double reference_fabs( double );
-double reference_acospi( double );
-double reference_asinpi( double );
-double reference_atanpi( double );
-double reference_atan2pi( double, double );
-double reference_cospi( double );
-double reference_divide( double, double );
-double reference_fract( double, double * );
-float  reference_fma( float, float, float, int );
-double reference_mad( double, double, double );
-double reference_nextafter(double, double );
-double reference_recip( double );
-double reference_rootn( double, int );
-double reference_rsqrt( double );
-double reference_sincos( double, double * );
-double reference_sinpi( double );
-double reference_tanpi( double );
+double reference_sinh(double x);
+double reference_sqrt(double x);
+double reference_tanh(double x);
+double reference_acos(double);
+double reference_asin(double);
+double reference_atan(double);
+double reference_atan2(double, double);
+double reference_ceil(double);
+double reference_cosh(double);
+double reference_exp(double);
+double reference_fabs(double);
+double reference_acospi(double);
+double reference_asinpi(double);
+double reference_atanpi(double);
+double reference_atan2pi(double, double);
+double reference_cospi(double);
+double reference_divide(double, double);
+double reference_fract(double, double*);
+float reference_fma(float, float, float, int);
+double reference_mad(double, double, double);
+double reference_nextafter(double, double);
+double reference_recip(double);
+double reference_rootn(double, int);
+double reference_rsqrt(double);
+double reference_sincos(double, double*);
+double reference_sinpi(double);
+double reference_tanpi(double);
 double reference_pow(double x, double y);
-double reference_pown( double, int );
-double reference_powr( double, double );
-double reference_cos( double );
-double reference_sin( double );
-double reference_tan( double );
-double reference_log( double );
-double reference_log10( double );
-double reference_modf( double, double *n );
-
-double reference_fdim( double, double );
-double reference_add( double, double );
-double reference_subtract( double, double );
-double reference_divide( double, double );
-double reference_multiply( double, double );
-double reference_remquo( double, double, int* );
-double reference_lgamma_r( double, int* );
-
-int reference_isequal( double, double );
-int reference_isfinite( double );
-int reference_isgreater( double, double );
-int reference_isgreaterequal( double, double );
-int reference_isinf( double );
-int reference_isless( double, double );
-int reference_islessequal( double, double );
-int reference_islessgreater( double, double );
-int reference_isnan( double );
-int reference_isnormal( double );
-int reference_isnotequal( double, double );
-int reference_isordered( double, double );
-int reference_isunordered( double, double );
-int reference_signbit( float );
-
-double reference_acosh( double x );
-double reference_asinh( double x );
-double reference_atanh( double x );
+double reference_pown(double, int);
+double reference_powr(double, double);
+double reference_cos(double);
+double reference_sin(double);
+double reference_tan(double);
+double reference_log(double);
+double reference_log10(double);
+double reference_modf(double, double* n);
+
+double reference_fdim(double, double);
+double reference_add(double, double);
+double reference_subtract(double, double);
+double reference_divide(double, double);
+double reference_multiply(double, double);
+double reference_remquo(double, double, int*);
+double reference_lgamma_r(double, int*);
+
+int reference_isequal(double, double);
+int reference_isfinite(double);
+int reference_isgreater(double, double);
+int reference_isgreaterequal(double, double);
+int reference_isinf(double);
+int reference_isless(double, double);
+int reference_islessequal(double, double);
+int reference_islessgreater(double, double);
+int reference_isnan(double);
+int reference_isnormal(double);
+int reference_isnotequal(double, double);
+int reference_isordered(double, double);
+int reference_isunordered(double, double);
+int reference_signbit(float);
+
+double reference_acosh(double x);
+double reference_asinh(double x);
+double reference_atanh(double x);
 double reference_cbrt(double x);
-float reference_copysign( float x, float y);
-double reference_copysignd( double x, double y);
-double reference_exp10( double );
-double reference_exp2( double x );
-double reference_expm1( double x );
-double reference_fmax( double x, double y );
-double reference_fmin( double x, double y );
-double reference_hypot( double x, double y );
-double reference_lgamma( double x);
-int    reference_ilogb( double );
-double reference_log2( double x );
-double reference_log1p( double x );
-double reference_logb( double x );
-double reference_maxmag( double x, double y );
-double reference_minmag( double x, double y );
-double reference_nan( cl_uint x );
-double reference_reciprocal( double x );
-double reference_remainder( double x, double y );
-double reference_rint( double x );
-double reference_round( double x );
-double reference_trunc( double x );
-double reference_floor( double x );
-double reference_fmod( double x, double y );
-double reference_frexp( double x, int *n );
-double reference_ldexp( double x, int n );
-
-double reference_assignment( double x );
-int    reference_not( double x );
+float reference_copysign(float x, float y);
+double reference_copysignd(double x, double y);
+double reference_exp10(double);
+double reference_exp2(double x);
+double reference_expm1(double x);
+double reference_fmax(double x, double y);
+double reference_fmin(double x, double y);
+double reference_hypot(double x, double y);
+double reference_lgamma(double x);
+int reference_ilogb(double);
+double reference_log2(double x);
+double reference_log1p(double x);
+double reference_logb(double x);
+double reference_maxmag(double x, double y);
+double reference_minmag(double x, double y);
+double reference_nan(cl_uint x);
+double reference_reciprocal(double x);
+double reference_remainder(double x, double y);
+double reference_rint(double x);
+double reference_round(double x);
+double reference_trunc(double x);
+double reference_floor(double x);
+double reference_fmod(double x, double y);
+double reference_frexp(double x, int* n);
+double reference_ldexp(double x, int n);
+
+double reference_assignment(double x);
+int reference_not(double x);
 // -- for testing fast-relaxed
 
 double reference_relaxed_acos(double);
 double reference_relaxed_asin(double);
 double reference_relaxed_atan(double);
-double reference_relaxed_mad( double, double, double );
-double reference_relaxed_divide( double x, double y );
-double reference_relaxed_sin( double x );
+double reference_relaxed_mad(double, double, double);
+double reference_relaxed_divide(double x, double y);
+double reference_relaxed_sin(double x);
 double reference_relaxed_sinpi(double x);
-double reference_relaxed_cos( double x );
+double reference_relaxed_cos(double x);
 double reference_relaxed_cospi(double x);
-double reference_relaxed_sincos( double x, double * y);
-double reference_relaxed_tan( double x );
-double reference_relaxed_exp( double x );
-double reference_relaxed_exp2( double x );
-double reference_relaxed_exp10( double x );
-double reference_relaxed_log( double x );
-double reference_relaxed_log2( double x );
+double reference_relaxed_sincos(double x, double* y);
+double reference_relaxed_tan(double x);
+double reference_relaxed_exp(double x);
+double reference_relaxed_exp2(double x);
+double reference_relaxed_exp10(double x);
+double reference_relaxed_log(double x);
+double reference_relaxed_log2(double x);
 double reference_relaxed_log10(double x);
-double reference_relaxed_pow( double x, double y);
-double reference_relaxed_reciprocal( double x );
+double reference_relaxed_pow(double x, double y);
+double reference_relaxed_reciprocal(double x);
 
 // -- for testing double --
 
-long double reference_sinhl( long double x );
-long double reference_sqrtl( long double x );
-long double reference_tanhl( long double x );
-long double reference_acosl( long double );
-long double reference_asinl( long double );
-long double reference_atanl( long double );
-long double reference_atan2l( long double, long double );
-long double reference_ceill( long double );
-long double reference_coshl( long double );
-long double reference_expl( long double );
-long double reference_fabsl( long double );
-long double reference_acospil( long double );
-long double reference_asinpil( long double );
-long double reference_atanpil( long double );
-long double reference_atan2pil( long double, long double );
-long double reference_cospil( long double );
-long double reference_dividel( long double, long double );
-long double reference_fractl( long double, long double * );
-long double reference_fmal( long double, long double, long double );
-long double reference_madl( long double, long double, long double );
-long double reference_nextafterl(long double, long double );
-long double reference_recipl( long double );
-long double reference_rootnl( long double, int );
-long double reference_rsqrtl( long double );
-long double reference_sincosl( long double, long double * );
-long double reference_sinpil( long double );
-long double reference_tanpil( long double );
+long double reference_sinhl(long double x);
+long double reference_sqrtl(long double x);
+long double reference_tanhl(long double x);
+long double reference_acosl(long double);
+long double reference_asinl(long double);
+long double reference_atanl(long double);
+long double reference_atan2l(long double, long double);
+long double reference_ceill(long double);
+long double reference_coshl(long double);
+long double reference_expl(long double);
+long double reference_fabsl(long double);
+long double reference_acospil(long double);
+long double reference_asinpil(long double);
+long double reference_atanpil(long double);
+long double reference_atan2pil(long double, long double);
+long double reference_cospil(long double);
+long double reference_dividel(long double, long double);
+long double reference_fractl(long double, long double*);
+long double reference_fmal(long double, long double, long double);
+long double reference_madl(long double, long double, long double);
+long double reference_nextafterl(long double, long double);
+long double reference_recipl(long double);
+long double reference_rootnl(long double, int);
+long double reference_rsqrtl(long double);
+long double reference_sincosl(long double, long double*);
+long double reference_sinpil(long double);
+long double reference_tanpil(long double);
 long double reference_powl(long double x, long double y);
-long double reference_pownl( long double, int );
-long double reference_powrl( long double, long double );
-long double reference_cosl( long double );
-long double reference_sinl(long double );
-long double reference_tanl( long double );
-long double reference_logl( long double );
-long double reference_log10l( long double );
-long double reference_modfl( long double, long double *n );
-
-
-long double reference_fdiml( long double, long double );
-long double reference_addl( long double, long double );
-long double reference_subtractl( long double, long double );
-long double reference_dividel( long double, long double );
-long double reference_multiplyl( long double, long double );
-long double reference_remquol( long double, long double, int* );
-long double reference_lgamma_rl( long double, int* );
-
-
-int reference_isequall( long double, long double );
-int reference_isfinitel( long double );
-int reference_isgreaterl( long double, long double );
-int reference_isgreaterequall( long double, long double );
-int reference_isinfl( long double );
-int reference_islessl( long double, long double );
-int reference_islessequall( long double, long double );
-int reference_islessgreaterl( long double, long double );
-int reference_isnanl( long double );
-int reference_isnormall( long double );
-int reference_isnotequall( long double, long double );
-int reference_isorderedl( long double, long double );
-int reference_isunorderedl( long double, long double );
-int reference_signbitl( long double );
-
-long double reference_acoshl( long double x );
-long double reference_asinhl( long double x );
-long double reference_atanhl( long double x );
+long double reference_pownl(long double, int);
+long double reference_powrl(long double, long double);
+long double reference_cosl(long double);
+long double reference_sinl(long double);
+long double reference_tanl(long double);
+long double reference_logl(long double);
+long double reference_log10l(long double);
+long double reference_modfl(long double, long double* n);
+
+
+long double reference_fdiml(long double, long double);
+long double reference_addl(long double, long double);
+long double reference_subtractl(long double, long double);
+long double reference_dividel(long double, long double);
+long double reference_multiplyl(long double, long double);
+long double reference_remquol(long double, long double, int*);
+long double reference_lgamma_rl(long double, int*);
+
+
+int reference_isequall(long double, long double);
+int reference_isfinitel(long double);
+int reference_isgreaterl(long double, long double);
+int reference_isgreaterequall(long double, long double);
+int reference_isinfl(long double);
+int reference_islessl(long double, long double);
+int reference_islessequall(long double, long double);
+int reference_islessgreaterl(long double, long double);
+int reference_isnanl(long double);
+int reference_isnormall(long double);
+int reference_isnotequall(long double, long double);
+int reference_isorderedl(long double, long double);
+int reference_isunorderedl(long double, long double);
+int reference_signbitl(long double);
+
+long double reference_acoshl(long double x);
+long double reference_asinhl(long double x);
+long double reference_atanhl(long double x);
 long double reference_cbrtl(long double x);
-long double reference_copysignl( long double x, long double y);
-long double reference_exp10l( long double );
-long double reference_exp2l( long double x );
-long double reference_expm1l( long double x );
-long double reference_fmaxl( long double x, long double y );
-long double reference_fminl( long double x, long double y );
-long double reference_hypotl( long double x, long double y );
-long double reference_lgammal( long double x);
-int    reference_ilogbl( long double );
-long double reference_log2l( long double x );
-long double reference_log1pl( long double x );
-long double reference_logbl( long double x );
-long double reference_maxmagl( long double x, long double y );
-long double reference_minmagl( long double x, long double y );
-long double reference_nanl( cl_ulong x );
-long double reference_reciprocall( long double x );
-long double reference_remainderl( long double x, long double y );
-long double reference_rintl( long double x );
-long double reference_roundl( long double x );
-long double reference_truncl( long double x );
-long double reference_floorl( long double x );
-long double reference_fmodl( long double x, long double y );
-long double reference_frexpl( long double x, int *n );
-long double reference_ldexpl( long double x, int n );
-
-long double reference_assignmentl( long double x );
-int reference_notl( long double x );
+long double reference_copysignl(long double x, long double y);
+long double reference_exp10l(long double);
+long double reference_exp2l(long double x);
+long double reference_expm1l(long double x);
+long double reference_fmaxl(long double x, long double y);
+long double reference_fminl(long double x, long double y);
+long double reference_hypotl(long double x, long double y);
+long double reference_lgammal(long double x);
+int reference_ilogbl(long double);
+long double reference_log2l(long double x);
+long double reference_log1pl(long double x);
+long double reference_logbl(long double x);
+long double reference_maxmagl(long double x, long double y);
+long double reference_minmagl(long double x, long double y);
+long double reference_nanl(cl_ulong x);
+long double reference_reciprocall(long double x);
+long double reference_remainderl(long double x, long double y);
+long double reference_rintl(long double x);
+long double reference_roundl(long double x);
+long double reference_truncl(long double x);
+long double reference_floorl(long double x);
+long double reference_fmodl(long double x, long double y);
+long double reference_frexpl(long double x, int* n);
+long double reference_ldexpl(long double x, int n);
+
+long double reference_assignmentl(long double x);
+int reference_notl(long double x);
 
 #endif
-
-
diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary.cpp
index fd97a95df7..448a7c3db9 100644
--- a/test_conformance/math_brute_force/ternary.cpp
+++ b/test_conformance/math_brute_force/ternary.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -35,15 +35,29 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-        "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2,  __global float", sizeNames[vectorSize], "* in3 )\n"
-        "{\n"
-        "   int i = get_global_id(0);\n"
-        "   out[i] = ", name, "( in1[i], in2[i], in3[i] );\n"
-        "}\n"
-    };
-
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global float* in2 , __global float* in3)\n"
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2,  __global float",
+                        sizeNames[vectorSize],
+                        "* in3 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], in3[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in, __global float* in2 , "
+        "__global float* in3)\n"
         "{\n"
         "   size_t i = get_global_id(0);\n"
         "   if( i + 1 < get_global_size(0) )\n"
@@ -51,12 +65,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "       float3 f0 = vload3( 0, in + 3 * i );\n"
         "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
         "       float3 f2 = vload3( 0, in3 + 3 * i );\n"
-        "       f0 = ", name, "( f0, f1, f2 );\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, f2 );\n"
         "       vstore3( f0, 0, out + 3*i );\n"
         "   }\n"
         "   else\n"
         "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
         "       float3 f0, f1, f2;\n"
         "       switch( parity )\n"
         "       {\n"
@@ -71,7 +89,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "               f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n"
         "               break;\n"
         "       }\n"
-        "       f0 = ", name, "( f0, f1, f2 );\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, f2 );\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 0:\n"
@@ -86,16 +106,17 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
@@ -103,17 +124,31 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2,  __global double",
+                        sizeNames[vectorSize],
+                        "* in3 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], in3[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
         "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2,  __global double", sizeNames[vectorSize], "* in3 )\n"
-        "{\n"
-        "   int i = get_global_id(0);\n"
-        "   out[i] = ", name, "( in1[i], in2[i], in3[i] );\n"
-        "}\n"
-    };
-
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in, __global double* in2 , __global double* in3)\n"
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2 , "
+        "__global double* in3)\n"
         "{\n"
         "   size_t i = get_global_id(0);\n"
         "   if( i + 1 < get_global_size(0) )\n"
@@ -121,12 +156,16 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "       double3 d0 = vload3( 0, in + 3 * i );\n"
         "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
         "       double3 d2 = vload3( 0, in3 + 3 * i );\n"
-        "       d0 = ", name, "( d0, d1, d2 );\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, d2 );\n"
         "       vstore3( d0, 0, out + 3*i );\n"
         "   }\n"
         "   else\n"
         "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
         "       double3 d0, d1, d2;\n"
         "       switch( parity )\n"
         "       {\n"
@@ -141,7 +180,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "               d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n"
         "               break;\n"
         "       }\n"
-        "       d0 = ", name, "( d0, d1, d2 );\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, d2 );\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 0:\n"
@@ -156,42 +197,47 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
     };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_kernel   *kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernels + i,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
                              info->programs + i, info->relaxedMode);
@@ -200,18 +246,85 @@ static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, vo
 
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
-    -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),  MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.75f, -1.5f, -1.25f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24), MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), MAKE_HEX_FLOAT(-0x1.003p0f, -0x1003000L, -24), -MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f,
-
-    +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.75f, 1.5f, 1.25f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), MAKE_HEX_FLOAT(0x1.003p0f, 0x1003000L, -24), +MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f
+    -NAN,
+    -INFINITY,
+    -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
+    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
+    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
+    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
+    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
+    -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
+    -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
+    -1.75f,
+    -1.5f,
+    -1.25f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
+    MAKE_HEX_FLOAT(-0x1.003p0f, -0x1003000L, -24),
+    -MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24),
+    -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
+    -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
+    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
+    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
+    -0.0f,
+
+    +NAN,
+    +INFINITY,
+    +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
+    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
+    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
+    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
+    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
+    2.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
+    +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
+    1.75f,
+    1.5f,
+    1.25f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
+    MAKE_HEX_FLOAT(0x1.003p0f, 0x1003000L, -24),
+    +MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24),
+    +1.0f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
+    +FLT_MIN,
+    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
+    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
+    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
+    +0.0f
 };
 
-static size_t specialValuesFloatCount = sizeof( specialValuesFloat ) / sizeof( specialValuesFloat[0] );
+static size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
 
 int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
@@ -219,23 +332,23 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
     float maxErrorVal3 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
 
     uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int skipNanInf = (0 == strcmp( "fma", f->nameInCode )) && ! gInfNanSupport;
-    cl_uchar overflow[BUFFER_SIZE / sizeof( float )];
+    int skipNanInf = (0 == strcmp("fma", f->nameInCode)) && !gInfNanSupport;
+    cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
     float float_ulps;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
-    if( gIsEmbedded )
+    if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
         float_ulps = f->float_ulps;
@@ -243,469 +356,570 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
         return error;
     /*
      for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-     if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-     return error;
+     if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs +
+     i) ) ) return error;
      */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
         uint32_t *p3 = (uint32_t *)gIn3;
         j = 0;
-        if( i == 0 )
+        if (i == 0)
         { // test edge cases
             float *fp = (float *)gIn;
             float *fp2 = (float *)gIn2;
             float *fp3 = (float *)gIn3;
-            uint32_t x, y, z;  x = y = z = 0;
-            for( ; j < bufferSize / sizeof( float ); j++ )
+            uint32_t x, y, z;
+            x = y = z = 0;
+            for (; j < bufferSize / sizeof(float); j++)
             {
                 fp[j] = specialValuesFloat[x];
                 fp2[j] = specialValuesFloat[y];
                 fp3[j] = specialValuesFloat[z];
 
-                if( ++x >= specialValuesFloatCount )
+                if (++x >= specialValuesFloatCount)
                 {
                     x = 0;
-                    if( ++y >= specialValuesFloatCount )
+                    if (++y >= specialValuesFloatCount)
                     {
                         y = 0;
-                        if( ++z >= specialValuesFloatCount )
-                            break;
+                        if (++z >= specialValuesFloatCount) break;
                     }
                 }
             }
-            if( j == bufferSize / sizeof( float ) )
-                vlog_error( "Test Error: not all special cases tested!\n" );
+            if (j == bufferSize / sizeof(float))
+                vlog_error("Test Error: not all special cases tested!\n");
         }
 
-        for( ; j < bufferSize / sizeof( float ); j++ )
+        for (; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
             p3[j] = genrand_int32(d);
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         float *r = (float *)gOut_Ref;
         float *s = (float *)gIn;
         float *s2 = (float *)gIn2;
         float *s3 = (float *)gIn3;
-        if( skipNanInf )
+        if (skipNanInf)
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
+            for (j = 0; j < bufferSize / sizeof(float); j++)
             {
                 feclearexcept(FE_OVERFLOW);
-                r[j] = (float) f->func.f_fma( s[j], s2[j], s3[j], CORRECTLY_ROUNDED );
-                overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+                r[j] =
+                    (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
+                overflow[j] =
+                    FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
             }
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                r[j] = (float) f->func.f_fma( s[j], s2[j], s3[j], CORRECTLY_ROUNDED );
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                r[j] =
+                    (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
         }
 
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+                if (t[j] != q[j])
                 {
                     float err;
                     int fail;
-                    float test = ((float*) q)[j];
-                    float correct = f->func.f_fma( s[j], s2[j], s3[j], CORRECTLY_ROUNDED );
+                    float test = ((float *)q)[j];
+                    float correct =
+                        f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
 
-                    // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                    if( skipNanInf )
+                    // Per section 10 paragraph 6, accept any result if an input
+                    // or output is a infinity or NaN or overflow
+                    if (skipNanInf)
                     {
-                        if( overflow[j]                                         ||
-                           IsFloatInfinity(correct) || IsFloatNaN(correct)     ||
-                           IsFloatInfinity(s[j])    || IsFloatNaN(s[j])        ||
-                           IsFloatInfinity(s2[j])   || IsFloatNaN(s2[j])       ||
-                           IsFloatInfinity(s3[j])   || IsFloatNaN(s3[j])       )
+                        if (overflow[j] || IsFloatInfinity(correct)
+                            || IsFloatNaN(correct) || IsFloatInfinity(s[j])
+                            || IsFloatNaN(s[j]) || IsFloatInfinity(s2[j])
+                            || IsFloatNaN(s2[j]) || IsFloatInfinity(s3[j])
+                            || IsFloatNaN(s3[j]))
                             continue;
                     }
 
 
-                    err = Ulp_Error( test, correct );
-                    fail = ! (fabsf(err) <= float_ulps);
+                    err = Ulp_Error(test, correct);
+                    fail = !(fabsf(err) <= float_ulps);
 
-                    if( fail && ftz )
+                    if (fail && ftz)
                     {
                         float correct2, err2;
 
                         // retry per section 6.5.3.2  with flushing on
-                        if( 0.0f == test && 0.0f == f->func.f_fma( s[j], s2[j], s3[j], FLUSHED ) )
+                        if (0.0f == test
+                            && 0.0f
+                                == f->func.f_fma(s[j], s2[j], s3[j], FLUSHED))
                         {
                             fail = 0;
                             err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( fail && IsFloatSubnormal( s[j] ) )
+                        if (fail && IsFloatSubnormal(s[j]))
                         { // look at me,
                             float err3, correct3;
 
-                            if( skipNanInf )
-                                feclearexcept( FE_OVERFLOW );
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
 
-                            correct2 = f->func.f_fma( 0.0f, s2[j], s3[j], CORRECTLY_ROUNDED );
-                            correct3 = f->func.f_fma( -0.0f, s2[j], s3[j], CORRECTLY_ROUNDED );
+                            correct2 = f->func.f_fma(0.0f, s2[j], s3[j],
+                                                     CORRECTLY_ROUNDED);
+                            correct3 = f->func.f_fma(-0.0f, s2[j], s3[j],
+                                                     CORRECTLY_ROUNDED);
 
-                            if( skipNanInf )
+                            if (skipNanInf)
                             {
-                                if( fetestexcept( FE_OVERFLOW ) )
-                                    continue;
-
-                                // Note: no double rounding here.  Reference functions calculate in single precision.
-                                if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) ||
-                                   IsFloatInfinity(correct3) || IsFloatNaN(correct3)   )
+                                if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correct2)
+                                    || IsFloatNaN(correct2)
+                                    || IsFloatInfinity(correct3)
+                                    || IsFloatNaN(correct3))
                                     continue;
                             }
 
-                            err2 = Ulp_Error( test, correct2  );
-                            err3 = Ulp_Error( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            err2 = Ulp_Error(test, correct2);
+                            err3 = Ulp_Error(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= float_ulps))
+                                    && (!(fabsf(err3) <= float_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( 0.0f == test &&
-                                ( 0.0f == f->func.f_fma(  0.0f, s2[j], s3[j], FLUSHED )  ||
-                                  0.0f == f->func.f_fma( -0.0f, s2[j], s3[j], FLUSHED ) )
-                              )
+                            if (0.0f == test
+                                && (0.0f
+                                        == f->func.f_fma(0.0f, s2[j], s3[j],
+                                                         FLUSHED)
+                                    || 0.0f
+                                        == f->func.f_fma(-0.0f, s2[j], s3[j],
+                                                         FLUSHED)))
                             {
                                 fail = 0;
                                 err = 0.0f;
                             }
 
-                            //try with first two args as zero
-                            if( IsFloatSubnormal( s2[j] ) )
+                            // try with first two args as zero
+                            if (IsFloatSubnormal(s2[j]))
                             { // its fun to have fun,
                                 double correct4, correct5;
                                 float err4, err5;
 
-                                if( skipNanInf )
-                                    feclearexcept( FE_OVERFLOW );
-
-                                correct2 = f->func.f_fma( 0.0f, 0.0f, s3[j], CORRECTLY_ROUNDED );
-                                correct3 = f->func.f_fma( -0.0f, 0.0f, s3[j], CORRECTLY_ROUNDED );
-                                correct4 = f->func.f_fma( 0.0f, -0.0f, s3[j], CORRECTLY_ROUNDED );
-                                correct5 = f->func.f_fma( -0.0f, -0.0f, s3[j], CORRECTLY_ROUNDED );
-
-                                // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                                if( !gInfNanSupport )
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                                correct2 = f->func.f_fma(0.0f, 0.0f, s3[j],
+                                                         CORRECTLY_ROUNDED);
+                                correct3 = f->func.f_fma(-0.0f, 0.0f, s3[j],
+                                                         CORRECTLY_ROUNDED);
+                                correct4 = f->func.f_fma(0.0f, -0.0f, s3[j],
+                                                         CORRECTLY_ROUNDED);
+                                correct5 = f->func.f_fma(-0.0f, -0.0f, s3[j],
+                                                         CORRECTLY_ROUNDED);
+
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is a infinity or NaN or
+                                // overflow
+                                if (!gInfNanSupport)
                                 {
-                                    if( fetestexcept(FE_OVERFLOW) )
-                                        continue;
-
-                                    // Note: no double rounding here.  Reference functions calculate in single precision.
-                                    if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) ||
-                                       IsFloatInfinity(correct3) || IsFloatNaN(correct3) ||
-                                       IsFloatInfinity(correct4) || IsFloatNaN(correct4) ||
-                                       IsFloatInfinity(correct5) || IsFloatNaN(correct5) )
+                                    if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (IsFloatInfinity(correct2)
+                                        || IsFloatNaN(correct2)
+                                        || IsFloatInfinity(correct3)
+                                        || IsFloatNaN(correct3)
+                                        || IsFloatInfinity(correct4)
+                                        || IsFloatNaN(correct4)
+                                        || IsFloatInfinity(correct5)
+                                        || IsFloatNaN(correct5))
                                         continue;
                                 }
 
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                err4 = Ulp_Error( test, correct4  );
-                                err5 = Ulp_Error( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)) &&
-                                                 (!(fabsf(err4) <= float_ulps)) && (!(fabsf(err5) <= float_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
+                                err2 = Ulp_Error(test, correct2);
+                                err3 = Ulp_Error(test, correct3);
+                                err4 = Ulp_Error(test, correct4);
+                                err5 = Ulp_Error(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= float_ulps))
+                                        && (!(fabsf(err3) <= float_ulps))
+                                        && (!(fabsf(err4) <= float_ulps))
+                                        && (!(fabsf(err5) <= float_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
 
                                 // retry per section 6.5.3.4
-                                if( 0.0f == test &&
-                                    ( 0.0f == f->func.f_fma(  0.0f,  0.0f, s3[j], FLUSHED )  ||
-                                      0.0f == f->func.f_fma( -0.0f,  0.0f, s3[j], FLUSHED )  ||
-                                      0.0f == f->func.f_fma(  0.0f, -0.0f, s3[j], FLUSHED )  ||
-                                      0.0f == f->func.f_fma( -0.0f, -0.0f, s3[j], FLUSHED )  )
-                                )
+                                if (0.0f == test
+                                    && (0.0f
+                                            == f->func.f_fma(0.0f, 0.0f, s3[j],
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, 0.0f, s3[j],
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(0.0f, -0.0f, s3[j],
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, -0.0f,
+                                                             s3[j], FLUSHED)))
                                 {
                                     fail = 0;
                                     err = 0.0f;
                                 }
 
-                                if( IsFloatSubnormal( s3[j] )  )
+                                if (IsFloatSubnormal(s3[j]))
                                 {
-                                    if( test == 0.0f )  // 0*0+0 is 0
+                                    if (test == 0.0f) // 0*0+0 is 0
                                     {
                                         fail = 0;
                                         err = 0.0f;
                                     }
                                 }
                             }
-                            else if( IsFloatSubnormal( s3[j] ) )
+                            else if (IsFloatSubnormal(s3[j]))
                             {
                                 double correct4, correct5;
                                 float err4, err5;
 
-                                if( skipNanInf )
-                                    feclearexcept( FE_OVERFLOW );
-
-                                correct2 = f->func.f_fma( 0.0f, s2[j], 0.0f, CORRECTLY_ROUNDED );
-                                correct3 = f->func.f_fma( -0.0f, s2[j], 0.0f, CORRECTLY_ROUNDED );
-                                correct4 = f->func.f_fma( 0.0f,  s2[j], -0.0f, CORRECTLY_ROUNDED );
-                                correct5 = f->func.f_fma( -0.0f, s2[j], -0.0f, CORRECTLY_ROUNDED );
-
-                                // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                                if( !gInfNanSupport )
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                                correct2 = f->func.f_fma(0.0f, s2[j], 0.0f,
+                                                         CORRECTLY_ROUNDED);
+                                correct3 = f->func.f_fma(-0.0f, s2[j], 0.0f,
+                                                         CORRECTLY_ROUNDED);
+                                correct4 = f->func.f_fma(0.0f, s2[j], -0.0f,
+                                                         CORRECTLY_ROUNDED);
+                                correct5 = f->func.f_fma(-0.0f, s2[j], -0.0f,
+                                                         CORRECTLY_ROUNDED);
+
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is a infinity or NaN or
+                                // overflow
+                                if (!gInfNanSupport)
                                 {
-                                    if( fetestexcept(FE_OVERFLOW) )
-                                        continue;
-
-                                    // Note: no double rounding here.  Reference functions calculate in single precision.
-                                    if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) ||
-                                       IsFloatInfinity(correct3) || IsFloatNaN(correct3) ||
-                                       IsFloatInfinity(correct4) || IsFloatNaN(correct4) ||
-                                       IsFloatInfinity(correct5) || IsFloatNaN(correct5) )
+                                    if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (IsFloatInfinity(correct2)
+                                        || IsFloatNaN(correct2)
+                                        || IsFloatInfinity(correct3)
+                                        || IsFloatNaN(correct3)
+                                        || IsFloatInfinity(correct4)
+                                        || IsFloatNaN(correct4)
+                                        || IsFloatInfinity(correct5)
+                                        || IsFloatNaN(correct5))
                                         continue;
                                 }
 
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                err4 = Ulp_Error( test, correct4  );
-                                err5 = Ulp_Error( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)) &&
-                                                 (!(fabsf(err4) <= float_ulps)) && (!(fabsf(err5) <= float_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
+                                err2 = Ulp_Error(test, correct2);
+                                err3 = Ulp_Error(test, correct3);
+                                err4 = Ulp_Error(test, correct4);
+                                err5 = Ulp_Error(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= float_ulps))
+                                        && (!(fabsf(err3) <= float_ulps))
+                                        && (!(fabsf(err4) <= float_ulps))
+                                        && (!(fabsf(err5) <= float_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
 
                                 // retry per section 6.5.3.4
-                                if( 0.0f == test &&
-                                    (   0.0f == f->func.f_fma( 0.0f, s2[j], 0.0f, FLUSHED )   ||
-                                        0.0f == f->func.f_fma(-0.0f, s2[j], 0.0f, FLUSHED )   ||
-                                        0.0f == f->func.f_fma( 0.0f, s2[j],-0.0f, FLUSHED )   ||
-                                        0.0f == f->func.f_fma(-0.0f, s2[j],-0.0f, FLUSHED )   )
-                                )
+                                if (0.0f == test
+                                    && (0.0f
+                                            == f->func.f_fma(0.0f, s2[j], 0.0f,
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, s2[j], 0.0f,
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(0.0f, s2[j], -0.0f,
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, s2[j],
+                                                             -0.0f, FLUSHED)))
                                 {
                                     fail = 0;
                                     err = 0.0f;
                                 }
                             }
                         }
-                        else if( fail && IsFloatSubnormal( s2[j] ) )
+                        else if (fail && IsFloatSubnormal(s2[j]))
                         {
                             double correct2, correct3;
                             float err2, err3;
 
-                            if( skipNanInf )
-                                feclearexcept( FE_OVERFLOW );
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
 
-                            correct2 = f->func.f_fma( s[j], 0.0f, s3[j], CORRECTLY_ROUNDED );
-                            correct3 = f->func.f_fma( s[j], -0.0f, s3[j], CORRECTLY_ROUNDED );
+                            correct2 = f->func.f_fma(s[j], 0.0f, s3[j],
+                                                     CORRECTLY_ROUNDED);
+                            correct3 = f->func.f_fma(s[j], -0.0f, s3[j],
+                                                     CORRECTLY_ROUNDED);
 
-                            if( skipNanInf )
+                            if (skipNanInf)
                             {
-                                if( fetestexcept( FE_OVERFLOW ) )
-                                    continue;
-
-                                // Note: no double rounding here.  Reference functions calculate in single precision.
-                                if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) ||
-                                   IsFloatInfinity(correct3) || IsFloatNaN(correct3)   )
+                                if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correct2)
+                                    || IsFloatNaN(correct2)
+                                    || IsFloatInfinity(correct3)
+                                    || IsFloatNaN(correct3))
                                     continue;
                             }
 
-                            err2 = Ulp_Error( test, correct2  );
-                            err3 = Ulp_Error( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            err2 = Ulp_Error(test, correct2);
+                            err3 = Ulp_Error(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= float_ulps))
+                                    && (!(fabsf(err3) <= float_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( 0.0f == test &&
-                                (   0.0f == f->func.f_fma( s[j], 0.0f, s3[j], FLUSHED )  ||
-                                    0.0f == f->func.f_fma( s[j], -0.0f, s3[j], FLUSHED ) )
-                            )
+                            if (0.0f == test
+                                && (0.0f
+                                        == f->func.f_fma(s[j], 0.0f, s3[j],
+                                                         FLUSHED)
+                                    || 0.0f
+                                        == f->func.f_fma(s[j], -0.0f, s3[j],
+                                                         FLUSHED)))
                             {
                                 fail = 0;
                                 err = 0.0f;
                             }
 
-                            //try with second two args as zero
-                            if( IsFloatSubnormal( s3[j] ) )
+                            // try with second two args as zero
+                            if (IsFloatSubnormal(s3[j]))
                             {
                                 double correct4, correct5;
                                 float err4, err5;
 
-                                if( skipNanInf )
-                                    feclearexcept( FE_OVERFLOW );
-
-                                correct2 = f->func.f_fma( s[j], 0.0f, 0.0f, CORRECTLY_ROUNDED );
-                                correct3 = f->func.f_fma( s[j], -0.0f, 0.0f, CORRECTLY_ROUNDED );
-                                correct4 = f->func.f_fma( s[j], 0.0f, -0.0f, CORRECTLY_ROUNDED );
-                                correct5 = f->func.f_fma( s[j], -0.0f, -0.0f, CORRECTLY_ROUNDED );
-
-                                // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                                if( !gInfNanSupport )
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                                correct2 = f->func.f_fma(s[j], 0.0f, 0.0f,
+                                                         CORRECTLY_ROUNDED);
+                                correct3 = f->func.f_fma(s[j], -0.0f, 0.0f,
+                                                         CORRECTLY_ROUNDED);
+                                correct4 = f->func.f_fma(s[j], 0.0f, -0.0f,
+                                                         CORRECTLY_ROUNDED);
+                                correct5 = f->func.f_fma(s[j], -0.0f, -0.0f,
+                                                         CORRECTLY_ROUNDED);
+
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is a infinity or NaN or
+                                // overflow
+                                if (!gInfNanSupport)
                                 {
-                                    if( fetestexcept(FE_OVERFLOW) )
-                                        continue;
-
-                                    // Note: no double rounding here.  Reference functions calculate in single precision.
-                                    if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) ||
-                                       IsFloatInfinity(correct3) || IsFloatNaN(correct3) ||
-                                       IsFloatInfinity(correct4) || IsFloatNaN(correct4) ||
-                                       IsFloatInfinity(correct5) || IsFloatNaN(correct5) )
+                                    if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (IsFloatInfinity(correct2)
+                                        || IsFloatNaN(correct2)
+                                        || IsFloatInfinity(correct3)
+                                        || IsFloatNaN(correct3)
+                                        || IsFloatInfinity(correct4)
+                                        || IsFloatNaN(correct4)
+                                        || IsFloatInfinity(correct5)
+                                        || IsFloatNaN(correct5))
                                         continue;
                                 }
 
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                err4 = Ulp_Error( test, correct4  );
-                                err5 = Ulp_Error( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)) &&
-                                                 (!(fabsf(err4) <= float_ulps)) && (!(fabsf(err5) <= float_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
+                                err2 = Ulp_Error(test, correct2);
+                                err3 = Ulp_Error(test, correct3);
+                                err4 = Ulp_Error(test, correct4);
+                                err5 = Ulp_Error(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= float_ulps))
+                                        && (!(fabsf(err3) <= float_ulps))
+                                        && (!(fabsf(err4) <= float_ulps))
+                                        && (!(fabsf(err5) <= float_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
 
                                 // retry per section 6.5.3.4
-                                if( 0.0f == test &&
-                                    (   0.0f == f->func.f_fma( s[j], 0.0f, 0.0f, FLUSHED )    ||
-                                        0.0f == f->func.f_fma( s[j],-0.0f, 0.0f, FLUSHED )    ||
-                                        0.0f == f->func.f_fma( s[j], 0.0f,-0.0f, FLUSHED )    ||
-                                        0.0f == f->func.f_fma( s[j],-0.0f,-0.0f, FLUSHED )    )
-                                )
+                                if (0.0f == test
+                                    && (0.0f
+                                            == f->func.f_fma(s[j], 0.0f, 0.0f,
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(s[j], -0.0f, 0.0f,
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(s[j], 0.0f, -0.0f,
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(s[j], -0.0f, -0.0f,
+                                                             FLUSHED)))
                                 {
                                     fail = 0;
                                     err = 0.0f;
                                 }
                             }
                         }
-                        else if( fail && IsFloatSubnormal(s3[j]) )
+                        else if (fail && IsFloatSubnormal(s3[j]))
                         {
                             double correct2, correct3;
                             float err2, err3;
 
-                            if( skipNanInf )
-                                feclearexcept( FE_OVERFLOW );
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
 
-                            correct2 = f->func.f_fma( s[j], s2[j], 0.0f, CORRECTLY_ROUNDED );
-                            correct3 = f->func.f_fma( s[j], s2[j], -0.0f, CORRECTLY_ROUNDED );
+                            correct2 = f->func.f_fma(s[j], s2[j], 0.0f,
+                                                     CORRECTLY_ROUNDED);
+                            correct3 = f->func.f_fma(s[j], s2[j], -0.0f,
+                                                     CORRECTLY_ROUNDED);
 
-                            if( skipNanInf )
+                            if (skipNanInf)
                             {
-                                if( fetestexcept( FE_OVERFLOW ) )
-                                    continue;
-
-                                // Note: no double rounding here.  Reference functions calculate in single precision.
-                                if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) ||
-                                   IsFloatInfinity(correct3) || IsFloatNaN(correct3)   )
+                                if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correct2)
+                                    || IsFloatNaN(correct2)
+                                    || IsFloatInfinity(correct3)
+                                    || IsFloatNaN(correct3))
                                     continue;
                             }
 
-                            err2 = Ulp_Error( test, correct2  );
-                            err3 = Ulp_Error( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            err2 = Ulp_Error(test, correct2);
+                            err3 = Ulp_Error(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= float_ulps))
+                                    && (!(fabsf(err3) <= float_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( 0.0f == test &&
-                                (   0.0f == f->func.f_fma( s[j], s2[j], 0.0f, FLUSHED ) ||
-                                    0.0f == f->func.f_fma( s[j], s2[j],-0.0f, FLUSHED )  )
-                            )
+                            if (0.0f == test
+                                && (0.0f
+                                        == f->func.f_fma(s[j], s2[j], 0.0f,
+                                                         FLUSHED)
+                                    || 0.0f
+                                        == f->func.f_fma(s[j], s2[j], -0.0f,
+                                                         FLUSHED)))
                             {
                                 fail = 0;
                                 err = 0.0f;
@@ -713,7 +927,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                         }
                     }
 
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
@@ -721,9 +935,14 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                         maxErrorVal3 = s3[j];
                     }
 
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %s%s: %f ulp error at {%a, %a, %a} ({0x%8.8x, 0x%8.8x, 0x%8.8x}): *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j], ((cl_uint*)s)[j], ((cl_uint*)s2)[j], ((cl_uint*)s3)[j],  ((float*) gOut_Ref)[j], test );
+                        vlog_error(
+                            "\nERROR: %s%s: %f ulp error at {%a, %a, %a} "
+                            "({0x%8.8x, 0x%8.8x, 0x%8.8x}): *%a vs. %a\n",
+                            f->name, sizeNames[k], err, s[j], s2[j], s3[j],
+                            ((cl_uint *)s)[j], ((cl_uint *)s2)[j],
+                            ((cl_uint *)s3)[j], ((float *)gOut_Ref)[j], test);
                         error = -1;
                         goto exit;
                     }
@@ -731,105 +950,135 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10u bufferSize:%10zd \n", i, step,  bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10u bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
         uint32_t *p3 = (uint32_t *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
             p3[j] = genrand_int32(d);
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, maxErrorVal3 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -840,18 +1089,75 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
-    -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0,
-
-    +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0,
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
+
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
 };
 
-static const size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] );
+static const size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
 
 
 int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
@@ -860,8 +1166,8 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ;
     double maxErrorVal = 0.0f;
@@ -869,7 +1175,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
     double maxErrorVal3 = 0.0f;
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(double), bufferSize);
 
     Force64BitFPUPrecision();
@@ -877,360 +1183,463 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
+    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info ) ))
+                               &build_info)))
     {
         return error;
     }
     /*
      for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-     if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-     return error;
+     if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i,
+     programs + i) ) ) return error;
      */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
         double *p3 = (double *)gIn3;
         j = 0;
-        if( i == 0 )
+        if (i == 0)
         { // test edge cases
-            uint32_t x, y, z;  x = y = z = 0;
-            for( ; j < bufferSize / sizeof( double ); j++ )
+            uint32_t x, y, z;
+            x = y = z = 0;
+            for (; j < bufferSize / sizeof(double); j++)
             {
                 p[j] = specialValuesDouble[x];
                 p2[j] = specialValuesDouble[y];
                 p3[j] = specialValuesDouble[z];
-                if( ++x >= specialValuesDoubleCount )
+                if (++x >= specialValuesDoubleCount)
                 {
                     x = 0;
-                    if( ++y >= specialValuesDoubleCount )
+                    if (++y >= specialValuesDoubleCount)
                     {
                         y = 0;
-                        if( ++z >= specialValuesDoubleCount )
-                            break;
+                        if (++z >= specialValuesDoubleCount) break;
                     }
                 }
             }
-            if( j == bufferSize / sizeof( double ) )
-                vlog_error( "Test Error: not all special cases tested!\n" );
+            if (j == bufferSize / sizeof(double))
+                vlog_error("Test Error: not all special cases tested!\n");
         }
 
-        for( ; j < bufferSize / sizeof( double ); j++ )
+        for (; j < bufferSize / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
             p3[j] = DoubleFromUInt32(genrand_int32(d));
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         double *r = (double *)gOut_Ref;
         double *s = (double *)gIn;
         double *s2 = (double *)gIn2;
         double *s3 = (double *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-            r[j] = (double) f->dfunc.f_fff( s[j], s2[j], s3[j] );
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+            r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint64_t *q = (uint64_t *)(gOut[k]);
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+                if (t[j] != q[j])
                 {
-                    double test = ((double*) q)[j];
-                    long double correct = f->dfunc.f_fff( s[j], s2[j], s3[j] );
-                    float err = Bruteforce_Ulp_Error_Double( test, correct );
-                    int fail = ! (fabsf(err) <= f->double_ulps);
+                    double test = ((double *)q)[j];
+                    long double correct = f->dfunc.f_fff(s[j], s2[j], s3[j]);
+                    float err = Bruteforce_Ulp_Error_Double(test, correct);
+                    int fail = !(fabsf(err) <= f->double_ulps);
 
-                    if( fail && ftz )
+                    if (fail && ftz)
                     {
                         // retry per section 6.5.3.2
-                        if( IsDoubleSubnormal(correct) )
+                        if (IsDoubleSubnormal(correct))
                         { // look at me,
-                            fail = fail && ( test != 0.0f );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( fail && IsDoubleSubnormal( s[j] ) )
+                        if (fail && IsDoubleSubnormal(s[j]))
                         { // look at me,
-                            long double correct2 = f->dfunc.f_fff( 0.0, s2[j], s3[j] );
-                            long double correct3 = f->dfunc.f_fff( -0.0, s2[j], s3[j] );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            long double correct2 =
+                                f->dfunc.f_fff(0.0, s2[j], s3[j]);
+                            long double correct3 =
+                                f->dfunc.f_fff(-0.0, s2[j], s3[j]);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps))
+                                    && (!(fabsf(err3) <= f->double_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
                             { // look at me now,
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
 
-                            //try with first two args as zero
-                            if( IsDoubleSubnormal( s2[j] ) )
+                            // try with first two args as zero
+                            if (IsDoubleSubnormal(s2[j]))
                             { // its fun to have fun,
-                                correct2 = f->dfunc.f_fff( 0.0, 0.0, s3[j] );
-                                correct3 = f->dfunc.f_fff( -0.0, 0.0, s3[j] );
-                                long double correct4 = f->dfunc.f_fff( 0.0, -0.0, s3[j] );
-                                long double correct5 = f->dfunc.f_fff( -0.0, -0.0, s3[j] );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                 (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
+                                correct2 = f->dfunc.f_fff(0.0, 0.0, s3[j]);
+                                correct3 = f->dfunc.f_fff(-0.0, 0.0, s3[j]);
+                                long double correct4 =
+                                    f->dfunc.f_fff(0.0, -0.0, s3[j]);
+                                long double correct5 =
+                                    f->dfunc.f_fff(-0.0, -0.0, s3[j]);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= f->double_ulps))
+                                        && (!(fabsf(err3) <= f->double_ulps))
+                                        && (!(fabsf(err4) <= f->double_ulps))
+                                        && (!(fabsf(err5) <= f->double_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
 
                                 // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ||
-                                   IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) )
+                                if (IsDoubleResultSubnormal(correct2,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct3,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct4,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct5,
+                                                               f->double_ulps))
                                 {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
                                 }
 
-                                if( IsDoubleSubnormal( s3[j] )  )
+                                if (IsDoubleSubnormal(s3[j]))
                                 { // but you have to know how!
-                                    correct2 = f->dfunc.f_fff( 0.0, 0.0, 0.0f );
-                                    correct3 = f->dfunc.f_fff( -0.0, 0.0, 0.0f );
-                                    correct4 = f->dfunc.f_fff( 0.0, -0.0, 0.0f );
-                                    correct5 = f->dfunc.f_fff( -0.0, -0.0, 0.0f );
-                                    long double correct6 = f->dfunc.f_fff( 0.0, 0.0, -0.0f );
-                                    long double correct7 = f->dfunc.f_fff( -0.0, 0.0, -0.0f );
-                                    long double correct8 = f->dfunc.f_fff( 0.0, -0.0, -0.0f );
-                                    long double correct9 = f->dfunc.f_fff( -0.0, -0.0, -0.0f );
-                                    err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                    err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                    err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                    err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                    float err6 = Bruteforce_Ulp_Error_Double( test, correct6  );
-                                    float err7 = Bruteforce_Ulp_Error_Double( test, correct7  );
-                                    float err8 = Bruteforce_Ulp_Error_Double( test, correct8  );
-                                    float err9 = Bruteforce_Ulp_Error_Double( test, correct9  );
-                                    fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                     (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)) &&
-                                                     (!(fabsf(err5) <= f->double_ulps)) && (!(fabsf(err6) <= f->double_ulps)) &&
-                                                     (!(fabsf(err7) <= f->double_ulps)) && (!(fabsf(err8) <= f->double_ulps)));
-                                    if( fabsf( err2 ) < fabsf(err ) )
-                                        err = err2;
-                                    if( fabsf( err3 ) < fabsf(err ) )
-                                        err = err3;
-                                    if( fabsf( err4 ) < fabsf(err ) )
-                                        err = err4;
-                                    if( fabsf( err5 ) < fabsf(err ) )
-                                        err = err5;
-                                    if( fabsf( err6 ) < fabsf(err ) )
-                                        err = err6;
-                                    if( fabsf( err7 ) < fabsf(err ) )
-                                        err = err7;
-                                    if( fabsf( err8 ) < fabsf(err ) )
-                                        err = err8;
-                                    if( fabsf( err9 ) < fabsf(err ) )
-                                        err = err9;
+                                    correct2 = f->dfunc.f_fff(0.0, 0.0, 0.0f);
+                                    correct3 = f->dfunc.f_fff(-0.0, 0.0, 0.0f);
+                                    correct4 = f->dfunc.f_fff(0.0, -0.0, 0.0f);
+                                    correct5 = f->dfunc.f_fff(-0.0, -0.0, 0.0f);
+                                    long double correct6 =
+                                        f->dfunc.f_fff(0.0, 0.0, -0.0f);
+                                    long double correct7 =
+                                        f->dfunc.f_fff(-0.0, 0.0, -0.0f);
+                                    long double correct8 =
+                                        f->dfunc.f_fff(0.0, -0.0, -0.0f);
+                                    long double correct9 =
+                                        f->dfunc.f_fff(-0.0, -0.0, -0.0f);
+                                    err2 = Bruteforce_Ulp_Error_Double(
+                                        test, correct2);
+                                    err3 = Bruteforce_Ulp_Error_Double(
+                                        test, correct3);
+                                    err4 = Bruteforce_Ulp_Error_Double(
+                                        test, correct4);
+                                    err5 = Bruteforce_Ulp_Error_Double(
+                                        test, correct5);
+                                    float err6 = Bruteforce_Ulp_Error_Double(
+                                        test, correct6);
+                                    float err7 = Bruteforce_Ulp_Error_Double(
+                                        test, correct7);
+                                    float err8 = Bruteforce_Ulp_Error_Double(
+                                        test, correct8);
+                                    float err9 = Bruteforce_Ulp_Error_Double(
+                                        test, correct9);
+                                    fail = fail
+                                        && ((!(fabsf(err2) <= f->double_ulps))
+                                            && (!(fabsf(err3)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err4)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err5)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err5)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err6)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err7)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err8)
+                                                  <= f->double_ulps)));
+                                    if (fabsf(err2) < fabsf(err)) err = err2;
+                                    if (fabsf(err3) < fabsf(err)) err = err3;
+                                    if (fabsf(err4) < fabsf(err)) err = err4;
+                                    if (fabsf(err5) < fabsf(err)) err = err5;
+                                    if (fabsf(err6) < fabsf(err)) err = err6;
+                                    if (fabsf(err7) < fabsf(err)) err = err7;
+                                    if (fabsf(err8) < fabsf(err)) err = err8;
+                                    if (fabsf(err9) < fabsf(err)) err = err9;
 
                                     // retry per section 6.5.3.4
-                                    if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )  ||
-                                       IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps )  ||
-                                       IsDoubleResultSubnormal( correct6, f->double_ulps ) || IsDoubleResultSubnormal( correct7, f->double_ulps )  ||
-                                       IsDoubleResultSubnormal( correct8, f->double_ulps ) || IsDoubleResultSubnormal( correct9, f->double_ulps )  )
+                                    if (IsDoubleResultSubnormal(correct2,
+                                                                f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct3, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct4, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct5, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct6, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct7, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct8, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct9, f->double_ulps))
                                     {
-                                        fail = fail && ( test != 0.0f);
-                                        if( ! fail )
-                                            err = 0.0f;
+                                        fail = fail && (test != 0.0f);
+                                        if (!fail) err = 0.0f;
                                     }
                                 }
                             }
-                            else if( IsDoubleSubnormal( s3[j] ) )
+                            else if (IsDoubleSubnormal(s3[j]))
                             {
-                                correct2 = f->dfunc.f_fff( 0.0, s2[j], 0.0 );
-                                correct3 = f->dfunc.f_fff( -0.0, s2[j], 0.0 );
-                                long double correct4 = f->dfunc.f_fff( 0.0,  s2[j], -0.0 );
-                                long double correct5 = f->dfunc.f_fff( -0.0, s2[j], -0.0 );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                 (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
+                                correct2 = f->dfunc.f_fff(0.0, s2[j], 0.0);
+                                correct3 = f->dfunc.f_fff(-0.0, s2[j], 0.0);
+                                long double correct4 =
+                                    f->dfunc.f_fff(0.0, s2[j], -0.0);
+                                long double correct5 =
+                                    f->dfunc.f_fff(-0.0, s2[j], -0.0);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= f->double_ulps))
+                                        && (!(fabsf(err3) <= f->double_ulps))
+                                        && (!(fabsf(err4) <= f->double_ulps))
+                                        && (!(fabsf(err5) <= f->double_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
 
                                 // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )  ||
-                                   IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) )
+                                if (IsDoubleResultSubnormal(correct2,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct3,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct4,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct5,
+                                                               f->double_ulps))
                                 {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
                                 }
                             }
                         }
-                        else if( fail && IsDoubleSubnormal( s2[j] ) )
+                        else if (fail && IsDoubleSubnormal(s2[j]))
                         {
-                            long double correct2 = f->dfunc.f_fff( s[j], 0.0, s3[j] );
-                            long double correct3 = f->dfunc.f_fff( s[j], -0.0, s3[j] );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            long double correct2 =
+                                f->dfunc.f_fff(s[j], 0.0, s3[j]);
+                            long double correct3 =
+                                f->dfunc.f_fff(s[j], -0.0, s3[j]);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps))
+                                    && (!(fabsf(err3) <= f->double_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps )  || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
 
-                            //try with second two args as zero
-                            if( IsDoubleSubnormal( s3[j] ) )
+                            // try with second two args as zero
+                            if (IsDoubleSubnormal(s3[j]))
                             {
-                                correct2 = f->dfunc.f_fff( s[j], 0.0, 0.0 );
-                                correct3 = f->dfunc.f_fff( s[j], -0.0, 0.0 );
-                                long double correct4 = f->dfunc.f_fff( s[j], 0.0, -0.0 );
-                                long double correct5 = f->dfunc.f_fff( s[j], -0.0, -0.0 );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                 (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
+                                correct2 = f->dfunc.f_fff(s[j], 0.0, 0.0);
+                                correct3 = f->dfunc.f_fff(s[j], -0.0, 0.0);
+                                long double correct4 =
+                                    f->dfunc.f_fff(s[j], 0.0, -0.0);
+                                long double correct5 =
+                                    f->dfunc.f_fff(s[j], -0.0, -0.0);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= f->double_ulps))
+                                        && (!(fabsf(err3) <= f->double_ulps))
+                                        && (!(fabsf(err4) <= f->double_ulps))
+                                        && (!(fabsf(err5) <= f->double_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
 
                                 // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ||
-                                   IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) )
+                                if (IsDoubleResultSubnormal(correct2,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct3,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct4,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct5,
+                                                               f->double_ulps))
                                 {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
                                 }
                             }
                         }
-                        else if( fail && IsDoubleSubnormal(s3[j]) )
+                        else if (fail && IsDoubleSubnormal(s3[j]))
                         {
-                            long double correct2 = f->dfunc.f_fff( s[j], s2[j], 0.0 );
-                            long double correct3 = f->dfunc.f_fff( s[j], s2[j], -0.0 );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            long double correct2 =
+                                f->dfunc.f_fff(s[j], s2[j], 0.0);
+                            long double correct3 =
+                                f->dfunc.f_fff(s[j], s2[j], -0.0);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps))
+                                    && (!(fabsf(err3) <= f->double_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
 
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
@@ -1238,9 +1647,12 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
                         maxErrorVal3 = s3[j];
                     }
 
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %sD%s: %f ulp error at {%.13la, %.13la, %.13la}: *%.13la vs. %.13la\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j], ((double*) gOut_Ref)[j], test );
+                        vlog_error("\nERROR: %sD%s: %f ulp error at {%.13la, "
+                                   "%.13la, %.13la}: *%.13la vs. %.13la\n",
+                                   f->name, sizeNames[k], err, s[j], s2[j],
+                                   s3[j], ((double *)gOut_Ref)[j], test);
                         error = -1;
                         goto exit;
                     }
@@ -1248,107 +1660,136 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
         double *p3 = (double *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
             p3[j] = DoubleFromUInt32(genrand_int32(d));
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, maxErrorVal3 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -1356,5 +1797,3 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
 
     return error;
 }
-
-
diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary.cpp
index 8ef33119c8..61a8546b40 100644
--- a/test_conformance/math_brute_force/unary.cpp
+++ b/test_conformance/math_brute_force/unary.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -18,8 +18,8 @@
 #include <string.h>
 #include "FunctionList.h"
 
-#if defined( __APPLE__ )
-    #include <sys/time.h>
+#if defined(__APPLE__)
+#include <sys/time.h>
 #endif
 
 int TestFunc_Float_Float(const Func *f, MTdata, bool relaxedMode);
@@ -37,61 +37,77 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       f0 = ", name, "( f0 );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 f0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( f0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -101,63 +117,80 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
                              bool relaxedMode)
 {
-    const char *c[] = {     "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in)\n"
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in)\n"
                         "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   if( i + 1 < get_global_size(0) )\n"
-                        "   {\n"
-                        "       double3 f0 = vload3( 0, in + 3 * i );\n"
-                        "       f0 = ", name, "( f0 );\n"
-                        "       vstore3( f0, 0, out + 3*i );\n"
-                        "   }\n"
-                        "   else\n"
-                        "   {\n"
-                        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                        "       double3 f0;\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 1:\n"
-                        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
-                        "               break;\n"
-                        "           case 0:\n"
-                        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                        "               break;\n"
-                        "       }\n"
-                        "       f0 = ", name, "( f0 );\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 0:\n"
-                        "               out[3*i+1] = f0.y; \n"
-                        "               // fall through\n"
-                        "           case 1:\n"
-                        "               out[3*i] = f0.x; \n"
-                        "               break;\n"
-                        "       }\n"
-                        "   }\n"
-                        "}\n"
-                    };
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 f0 = vload3( 0, in + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -165,91 +198,102 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_uint     kernel_count;
-    cl_kernel   **kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count;
+    cl_kernel **kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
                              info->kernels[i], info->programs + i,
                              info->relaxedMode);
 }
 
-//Thread specific data for a worker thread
+// Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
-    cl_mem      inBuf;                              // input buffer for the thread
-    cl_mem      outBuf[ VECTOR_SIZE_COUNT ];        // output buffers for the thread
-    float       maxError;                           // max error value. Init to 0.
-    double      maxErrorValue;                      // position of the max error value.  Init to 0.
-    cl_command_queue tQueue;                        // per thread command queue to improve performance
-}ThreadInfo;
+    cl_mem inBuf; // input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    float maxError; // max error value. Init to 0.
+    double maxErrorValue; // position of the max error value.  Init to 0.
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
 
 typedef struct TestInfo
 {
-    size_t      subBufferSize;                      // Size of the sub-buffer in elements
-    const Func  *f;                                 // A pointer to the function info
-    cl_program  programs[ VECTOR_SIZE_COUNT ];      // programs for various vector sizes
-    cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
-    ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
-    cl_uint     threadCount;                        // Number of worker threads
-    cl_uint     jobCount;                           // Number of jobs
-    cl_uint     step;                               // step between each chunk and the next.
-    cl_uint     scale;                              // stride between individual test values
-    float       ulps;                               // max_allowed ulps
-    int         ftz;                                // non-zero if running in flush to zero mode
-
-    int         isRangeLimited;                     // 1 if the function is only to be evaluated over a range
-    float       half_sin_cos_tan_limit;
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    int isRangeLimited; // 1 if the function is only to be evaluated over a
+                        // range
+    float half_sin_cos_tan_limit;
     bool relaxedMode; // True if test is to be run in relaxed mode, false
                       // otherwise.
-}TestInfo;
+} TestInfo;
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
     int skipTestingRelaxed = (relaxedMode && strcmp(f->name, "tan") == 0);
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
 
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
     if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -259,69 +303,89 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 
     test_info.f = f;
     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     test_info.relaxedMode = relaxedMode;
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
-
     }
 
     // Check for special cases for unary float
     test_info.isRangeLimited = 0;
     test_info.half_sin_cos_tan_limit = 0;
-    if( 0 == strcmp( f->name, "half_sin") || 0 == strcmp( f->name, "half_cos") )
+    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
     {
         test_info.isRangeLimited = 1;
-        test_info.half_sin_cos_tan_limit = 1.0f + test_info.ulps * (FLT_EPSILON/2.0f);             // out of range results from finite inputs must be in [-1,1]
+        test_info.half_sin_cos_tan_limit = 1.0f
+            + test_info.ulps
+                * (FLT_EPSILON / 2.0f); // out of range results from finite
+                                        // inputs must be in [-1,1]
     }
-    else if( 0 == strcmp( f->name, "half_tan"))
+    else if (0 == strcmp(f->name, "half_tan"))
     {
         test_info.isRangeLimited = 1;
-        test_info.half_sin_cos_tan_limit = INFINITY;             // out of range resut from finite inputs must be numeric
+        test_info.half_sin_cos_tan_limit =
+            INFINITY; // out of range resut from finite inputs must be numeric
     }
 
     // Init the kernels
@@ -330,141 +394,156 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
-    if( !gSkipCorrectnessTesting || skipTestingRelaxed)
+    if (!gSkipCorrectnessTesting || skipTestingRelaxed)
     {
-        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            if( test_info.tinfo[i].maxError > maxError )
+            if (test_info.tinfo[i].maxError > maxError)
             {
                 maxError = test_info.tinfo[i].maxError;
                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
             }
         }
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
 
-        if( skipTestingRelaxed )
+        if (skipTestingRelaxed)
         {
-          vlog(" (rlx skip correctness testing)\n");
-          goto exit;
+            vlog(" (rlx skip correctness testing)\n");
+            goto exit;
         }
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        if( strstr( f->name, "exp" ) || strstr( f->name, "sin" ) || strstr( f->name, "cos" ) || strstr( f->name, "tan" ) )
-            for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
-                ((float*)p)[j] = (float) genrand_real1(d);
-        else if( strstr( f->name, "log" ) )
-            for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
+        if (strstr(f->name, "exp") || strstr(f->name, "sin")
+            || strstr(f->name, "cos") || strstr(f->name, "tan"))
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+                ((float *)p)[j] = (float)genrand_real1(d);
+        else if (strstr(f->name, "log"))
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = genrand_int32(d) & 0x7fffffff;
         else
-            for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = genrand_int32(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError( test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double current_time = SubtractTime( endTime, startTime );
+                double current_time = SubtractTime(endTime, startTime);
                 sum += current_time;
-                if( current_time < bestTime )
-                    bestTime = current_time;
+                if (current_time < bestTime) bestTime = current_time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    vlog("\n");
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t  buffer_elements = job->subBufferSize;
-    size_t  buffer_size = buffer_elements * sizeof( cl_float );
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint scale = job->scale;
-    cl_uint base = job_id * (cl_uint) job->step;
+    cl_uint base = job_id * (cl_uint)job->step;
     ThreadInfo *tinfo = job->tinfo + thread_id;
-    fptr    func = job->f->func;
-    const char * fname = job->f->name;
+    fptr func = job->f->func;
+    const char *fname = job->f->name;
     bool relaxedMode = job->relaxedMode;
     float ulps = getAllowedUlpError(job->f, relaxedMode);
     if (relaxedMode)
@@ -480,153 +559,177 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
     int ftz = job->ftz;
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_uint  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_uint *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (uint32_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
     // Write the new values to the input array
-    cl_uint *p = (cl_uint*) gIn + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
+    cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
     {
-      p[j] = base + j * scale;
-      if (relaxedMode)
-      {
-        float p_j = *(float *) &p[j];
-        if ( strcmp(fname,"sin")==0 || strcmp(fname,"cos")==0 )  //the domain of the function is [-pi,pi]
+        p[j] = base + j * scale;
+        if (relaxedMode)
         {
-            if (fabs(p_j) > M_PI) ((float *)p)[j] = NAN;
-        }
+            float p_j = *(float *)&p[j];
+            if (strcmp(fname, "sin") == 0
+                || strcmp(fname, "cos")
+                    == 0) // the domain of the function is [-pi,pi]
+            {
+                if (fabs(p_j) > M_PI) ((float *)p)[j] = NAN;
+            }
 
-        if ( strcmp( fname, "reciprocal" ) == 0 )
-        {
-            const float l_limit = HEX_FLT(+, 1, 0, -, 126);
-            const float u_limit = HEX_FLT(+, 1, 0, +, 126);
+            if (strcmp(fname, "reciprocal") == 0)
+            {
+                const float l_limit = HEX_FLT(+, 1, 0, -, 126);
+                const float u_limit = HEX_FLT(+, 1, 0, +, 126);
 
-            if (fabs(p_j) < l_limit
-                || fabs(p_j)
-                    > u_limit) // the domain of the function is [2^-126,2^126]
-                ((float *)p)[j] = NAN;
+                if (fabs(p_j) < l_limit
+                    || fabs(p_j) > u_limit) // the domain of the function is
+                                            // [2^-126,2^126]
+                    ((float *)p)[j] = NAN;
+            }
         }
-      }
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         return error;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             return error;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             return error;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             return error;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
 
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            vlog_error("FAILED -- could not execute kernel\n");
             return error;
         }
     }
 
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
+    // Calculate the correctly rounded reference result
     float *r = (float *)gOut_Ref + thread_id * buffer_elements;
     float *s = (float *)p;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = (float) func.f_f( s[j] );
+    for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_f(s[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (uint32_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
     // Wait for the last buffer
-    out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         return error;
     }
 
-    //Verify data
+    // Verify data
     uint32_t *t = (uint32_t *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             uint32_t *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if( t[j] != q[j] )
+            if (t[j] != q[j])
             {
-                float test = ((float*) q)[j];
-                double correct = func.f_f( s[j] );
-                float err = Ulp_Error( test, correct );
-                float abs_error = Abs_Error( test, correct );
+                float test = ((float *)q)[j];
+                double correct = func.f_f(s[j]);
+                float err = Ulp_Error(test, correct);
+                float abs_error = Abs_Error(test, correct);
                 int fail = 0;
                 int use_abs_error = 0;
 
-                // it is possible for the output to not match the reference result but for Ulp_Error
-                // to be zero, for example -1.#QNAN vs. 1.#QNAN. In such cases there is no failure
+                // it is possible for the output to not match the reference
+                // result but for Ulp_Error to be zero, for example -1.#QNAN
+                // vs. 1.#QNAN. In such cases there is no failure
                 if (err == 0.0f)
                 {
                     fail = 0;
                 }
                 else if (relaxedMode)
                 {
-                    if ( strcmp(fname,"sin")==0 || strcmp(fname,"cos")==0 )
+                    if (strcmp(fname, "sin") == 0 || strcmp(fname, "cos") == 0)
                     {
-                        fail = ! (fabsf(abs_error) <= ulps);
+                        fail = !(fabsf(abs_error) <= ulps);
                         use_abs_error = 1;
                     }
                     if (strcmp(fname, "sinpi") == 0
@@ -639,12 +742,12 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
                         }
                     }
 
-                    if ( strcmp(fname, "reciprocal") == 0 )
+                    if (strcmp(fname, "reciprocal") == 0)
                     {
-                        fail = ! (fabsf(err) <= ulps);
+                        fail = !(fabsf(err) <= ulps);
                     }
 
-                    if ( strcmp(fname, "exp") == 0 || strcmp(fname, "exp2") == 0 )
+                    if (strcmp(fname, "exp") == 0 || strcmp(fname, "exp2") == 0)
                     {
                         float exp_error = ulps;
 
@@ -653,153 +756,171 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
                             exp_error += floor(fabs(2 * s[j]));
                         }
 
-                        fail = ! (fabsf(err) <= exp_error);
+                        fail = !(fabsf(err) <= exp_error);
                         ulps = exp_error;
                     }
-                    if (strcmp(fname, "tan") == 0) {
+                    if (strcmp(fname, "tan") == 0)
+                    {
 
-                        if(  !gFastRelaxedDerived )
+                        if (!gFastRelaxedDerived)
                         {
-                            fail = ! (fabsf(err) <= ulps);
+                            fail = !(fabsf(err) <= ulps);
                         }
-                        // Else fast math derived implementation does not require ULP verification
+                        // Else fast math derived implementation does not
+                        // require ULP verification
                     }
                     if (strcmp(fname, "exp10") == 0)
                     {
-                        if(  !gFastRelaxedDerived )
+                        if (!gFastRelaxedDerived)
                         {
-                            fail = ! (fabsf(err) <= ulps);
+                            fail = !(fabsf(err) <= ulps);
                         }
-                        // Else fast math derived implementation does not require ULP verification
+                        // Else fast math derived implementation does not
+                        // require ULP verification
                     }
                     if (strcmp(fname, "log") == 0 || strcmp(fname, "log2") == 0
                         || strcmp(fname, "log10") == 0)
                     {
-                        if( s[j] >= 0.5 && s[j] <= 2 )
+                        if (s[j] >= 0.5 && s[j] <= 2)
                         {
-                            fail = ! (fabsf(abs_error) <= ulps );
+                            fail = !(fabsf(abs_error) <= ulps);
                         }
                         else
                         {
-                            ulps = gIsEmbedded ? job->f->float_embedded_ulps : job->f->float_ulps;
-                            fail = ! (fabsf(err) <= ulps);
+                            ulps = gIsEmbedded ? job->f->float_embedded_ulps
+                                               : job->f->float_ulps;
+                            fail = !(fabsf(err) <= ulps);
                         }
                     }
 
 
                     // fast-relaxed implies finite-only
-                    if( IsFloatInfinity(correct) || IsFloatNaN(correct)     ||
-                        IsFloatInfinity(s[j])    || IsFloatNaN(s[j])        ) {
+                    if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                        || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
+                    {
                         fail = 0;
                         err = 0;
                     }
                 }
                 else
                 {
-                  fail = ! (fabsf(err) <= ulps);
+                    fail = !(fabsf(err) <= ulps);
                 }
 
                 // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
-                if( isRangeLimited && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) && fabsf(s[j]) < INFINITY )
+                if (isRangeLimited
+                    && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16)
+                    && fabsf(s[j]) < INFINITY)
                 {
-                    if( fabsf( test ) <= half_sin_cos_tan_limit )
+                    if (fabsf(test) <= half_sin_cos_tan_limit)
                     {
                         err = 0;
                         fail = 0;
                     }
                 }
 
-                if( fail )
+                if (fail)
                 {
-                    if( ftz )
+                    if (ftz)
                     {
-                        typedef int (*CheckForSubnormal) (double,float); // If we are in fast relaxed math, we have a different calculation for the subnormal threshold.
+                        typedef int (*CheckForSubnormal)(
+                            double, float); // If we are in fast relaxed math,
+                                            // we have a different calculation
+                                            // for the subnormal threshold.
                         CheckForSubnormal isFloatResultSubnormalPtr;
 
                         if (relaxedMode)
                         {
-                          isFloatResultSubnormalPtr = &IsFloatResultSubnormalAbsError;
+                            isFloatResultSubnormalPtr =
+                                &IsFloatResultSubnormalAbsError;
                         }
                         else
                         {
-                          isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
+                            isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
                         }
                         // retry per section 6.5.3.2
-                        if( (*isFloatResultSubnormalPtr)(correct, ulps) )
+                        if ((*isFloatResultSubnormalPtr)(correct, ulps))
                         {
-                            fail = fail && ( test != 0.0f );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( IsFloatSubnormal( s[j] ) )
+                        if (IsFloatSubnormal(s[j]))
                         {
-                            double correct2 = func.f_f( 0.0 );
-                            double correct3 = func.f_f( -0.0 );
+                            double correct2 = func.f_f(0.0);
+                            double correct3 = func.f_f(-0.0);
                             float err2;
                             float err3;
-                            if( use_abs_error )
+                            if (use_abs_error)
                             {
-                              err2 = Abs_Error( test, correct2  );
-                              err3 = Abs_Error( test, correct3  );
+                                err2 = Abs_Error(test, correct2);
+                                err3 = Abs_Error(test, correct3);
                             }
                             else
                             {
-                              err2 = Ulp_Error( test, correct2  );
-                              err3 = Ulp_Error( test, correct3  );
+                                err2 = Ulp_Error(test, correct2);
+                                err3 = Ulp_Error(test, correct3);
                             }
-                            fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( (*isFloatResultSubnormalPtr)(correct2, ulps ) || (*isFloatResultSubnormalPtr)(correct3, ulps ) )
+                            if ((*isFloatResultSubnormalPtr)(correct2, ulps)
+                                || (*isFloatResultSubnormalPtr)(correct3, ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
                 }
-                if( fabsf(err ) > tinfo->maxError )
+                if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);
                     tinfo->maxErrorValue = s[j];
                 }
-                if( fail )
+                if (fail)
                 {
-                    vlog_error( "\nERROR: %s%s: %f ulp error at %a (0x%8.8x): *%a vs. %a\n", job->f->name, sizeNames[k], err, ((float*) s)[j], ((uint32_t*) s)[j], ((float*) t)[j], test);
+                    vlog_error("\nERROR: %s%s: %f ulp error at %a (0x%8.8x): "
+                               "*%a vs. %a\n",
+                               job->f->name, sizeNames[k], err, ((float *)s)[j],
+                               ((uint32_t *)s)[j], ((float *)t)[j], test);
                     return -1;
                 }
             }
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-            vlog("." );
+            vlog(".");
         }
         fflush(stdout);
     }
@@ -808,17 +929,16 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
 }
 
 
-
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t  buffer_elements = job->subBufferSize;
-    size_t  buffer_size = buffer_elements * sizeof( cl_double );
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint scale = job->scale;
-    cl_uint base = job_id * (cl_uint) job->step;
+    cl_uint base = job_id * (cl_uint)job->step;
     ThreadInfo *tinfo = job->tinfo + thread_id;
-    float   ulps = job->ulps;
-    dptr    func = job->f->dfunc;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
     cl_uint j, k;
     cl_int error;
     int ftz = job->ftz;
@@ -826,190 +946,221 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
     Force64BitFPUPrecision();
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_ulong *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
     // Write the new values to the input array
-    cl_double *p = (cl_double*) gIn + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        p[j] = DoubleFromUInt32( base + j * scale);
+    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        p[j] = DoubleFromUInt32(base + j * scale);
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         return error;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             return error;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             return error;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             return error;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
 
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            vlog_error("FAILED -- could not execute kernel\n");
             return error;
         }
     }
 
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
+    // Calculate the correctly rounded reference result
     cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
     cl_double *s = (cl_double *)p;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = (cl_double) func.f_f( s[j] );
+    for (j = 0; j < buffer_elements; j++) r[j] = (cl_double)func.f_f(s[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
     // Wait for the last buffer
-    out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         return error;
     }
 
 
-    //Verify data
+    // Verify data
     cl_ulong *t = (cl_ulong *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_ulong *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if( t[j] != q[j] )
+            if (t[j] != q[j])
             {
-                cl_double test = ((cl_double*) q)[j];
-                long double correct = func.f_f( s[j] );
-                float err = Bruteforce_Ulp_Error_Double( test, correct );
-                int fail = ! (fabsf(err) <= ulps);
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_f(s[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
 
-                if( fail )
+                if (fail)
                 {
-                    if( ftz )
+                    if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if( IsDoubleResultSubnormal(correct, ulps) )
+                        if (IsDoubleResultSubnormal(correct, ulps))
                         {
-                            fail = fail && ( test != 0.0f );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( IsDoubleSubnormal( s[j] ) )
+                        if (IsDoubleSubnormal(s[j]))
                         {
-                            long double correct2 = func.f_f( 0.0L );
-                            long double correct3 = func.f_f( -0.0L );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            long double correct2 = func.f_f(0.0L);
+                            long double correct3 = func.f_f(-0.0L);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal(correct2, ulps ) || IsDoubleResultSubnormal(correct3, ulps ) )
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
                 }
-                if( fabsf(err ) > tinfo->maxError )
+                if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);
                     tinfo->maxErrorValue = s[j];
                 }
-                if( fail )
+                if (fail)
                 {
-                    vlog_error( "\nERROR: %s%s: %f ulp error at %.13la (0x%16.16llx): *%.13la vs. %.13la\n", job->f->name, sizeNames[k], err, ((cl_double*) gIn)[j], ((cl_ulong*) gIn)[j], ((cl_double*) gOut_Ref)[j], test );
+                    vlog_error("\nERROR: %s%s: %f ulp error at %.13la "
+                               "(0x%16.16llx): *%.13la vs. %.13la\n",
+                               job->f->name, sizeNames[k], err,
+                               ((cl_double *)gIn)[j], ((cl_ulong *)gIn)[j],
+                               ((cl_double *)gOut_Ref)[j], test);
                     return -1;
                 }
             }
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10zd buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step, buffer_elements, job->scale, job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10zd buf_elements:%10u ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, buffer_elements, job->scale, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-            vlog("." );
+            vlog(".");
         }
         fflush(stdout);
     }
@@ -1019,33 +1170,36 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
 
 int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
-#if defined( __APPLE__ )
-    struct timeval  time_val;
-    gettimeofday( &time_val, NULL );
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+#if defined(__APPLE__)
+    struct timeval time_val;
+    gettimeofday(&time_val, NULL);
     double start_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
     double end_time;
 #endif
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
     if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -1058,52 +1212,69 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     test_info.ftz = f->ftz || gForceFTZ;
     test_info.relaxedMode = relaxedMode;
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+            /* Qualcomm fix: 9461 read-write flags must be compatible with
+             * parent buffer */
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
             /* Qualcomm fix: end */
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
     }
@@ -1114,136 +1285,147 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
-           goto exit;
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
     }
 
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            if( test_info.tinfo[i].maxError > maxError )
+            if (test_info.tinfo[i].maxError > maxError)
             {
                 maxError = test_info.tinfo[i].maxError;
                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
             }
         }
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
 
-#if defined( __APPLE__ )
-    gettimeofday( &time_val, NULL);
+#if defined(__APPLE__)
+    gettimeofday(&time_val, NULL);
     end_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
 #endif
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
 
-        if( strstr( f->name, "exp" ) )
-            for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ )
+        if (strstr(f->name, "exp"))
+            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
                 p[j] = (double)genrand_real1(d);
-        else if( strstr( f->name, "log" ) )
-            for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ )
-                p[j] = fabs(DoubleFromUInt32( genrand_int32(d)));
+        else if (strstr(f->name, "log"))
+            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+                p[j] = fabs(DoubleFromUInt32(genrand_int32(d)));
         else
-            for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ )
-                p[j] = DoubleFromUInt32( genrand_int32(d) );
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+                p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double current_time = SubtractTime( endTime, startTime );
+                double current_time = SubtractTime(endTime, startTime);
                 sum += current_time;
-                if( current_time < bestTime )
-                    bestTime = current_time;
+                if (current_time < bestTime) bestTime = current_time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
 
-#if defined( __APPLE__ )
-    vlog( "\t(%2.2f seconds)", end_time - start_time );
+#if defined(__APPLE__)
+    vlog("\t(%2.2f seconds)", end_time - start_time);
 #endif
-    vlog( "\n" );
+    vlog("\n");
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
-
-
diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results.cpp
index b170e09585..d468d26de0 100644
--- a/test_conformance/math_brute_force/unary_two_results.cpp
+++ b/test_conformance/math_brute_force/unary_two_results.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -32,64 +32,83 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i], out2 + i );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* out2, __global float* in)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       float3 iout = NAN;\n"
-                            "       f0 = ", name, "( f0, &iout );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "       vstore3( iout, 0, out2 + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 iout = NAN;\n"
-                            "       float3 f0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( f0, &iout );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               out2[3*i+1] = iout.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               out2[3*i] = iout.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* out2, __global float",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i], out2 + i );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* out2, __global float* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 iout = NAN;\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( iout, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 iout = NAN;\n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = iout.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = iout.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
@@ -98,91 +117,114 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i], out2 + i );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* out2, __global double* in)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       double3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       double3 iout = NAN;\n"
-                            "       f0 = ", name, "( f0, &iout );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "       vstore3( iout, 0, out2 + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       double3 iout = NAN;\n"
-                            "       double3 f0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( f0, &iout );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               out2[3*i+1] = iout.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               out2[3*i] = iout.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* out2, __global double",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i], out2 + i );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* out2, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 f0 = vload3( 0, in + 3 * i );\n"
+        "       double3 iout = NAN;\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( iout, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 iout = NAN;\n"
+        "       double3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = iout.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = iout.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_kernel   *kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernels + i,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
                              info->programs + i, info->relaxedMode);
@@ -194,20 +236,20 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
     uint32_t j, k;
     uint32_t l;
     int error;
-    char const * testing_mode;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    char const *testing_mode;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError0 = 0.0f;
     float maxError1 = 0.0f;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal0 = 0.0f;
     float maxErrorVal1 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);
-    cl_uchar overflow[BUFFER_SIZE / sizeof( float )];
-    int isFract = 0 == strcmp( "fract", f->nameInCode );
-    int skipNanInf = isFract  && ! gInfNanSupport;
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
+    cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
+    int isFract = 0 == strcmp("fract", f->nameInCode);
+    int skipNanInf = isFract && !gInfNanSupport;
     float float_ulps = getAllowedUlpError(f, relaxedMode);
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
@@ -215,222 +257,256 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
         return error;
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
+       programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        if( gWimpyMode )
+        if (gWimpyMode)
         {
-          for( j = 0; j < bufferSize / sizeof( float ); j++ )
-          {
-            p[j] = (uint32_t) i + j * scale;
-            if (relaxedMode && strcmp(f->name, "sincos") == 0)
+            for (j = 0; j < bufferSize / sizeof(float); j++)
             {
-              float pj = *(float *)&p[j];
-              if (fabs(pj) > M_PI) ((float *)p)[j] = NAN;
+                p[j] = (uint32_t)i + j * scale;
+                if (relaxedMode && strcmp(f->name, "sincos") == 0)
+                {
+                    float pj = *(float *)&p[j];
+                    if (fabs(pj) > M_PI) ((float *)p)[j] = NAN;
+                }
             }
-          }
         }
         else
         {
-          for( j = 0; j < bufferSize / sizeof( float ); j++ )
-          {
-            p[j] = (uint32_t) i + j;
-            if (relaxedMode && strcmp(f->name, "sincos") == 0)
+            for (j = 0; j < bufferSize / sizeof(float); j++)
             {
-              float pj = *(float *)&p[j];
-              if (fabs(pj) > M_PI) ((float *)p)[j] = NAN;
+                p[j] = (uint32_t)i + j;
+                if (relaxedMode && strcmp(f->name, "sincos") == 0)
+                {
+                    float pj = *(float *)&p[j];
+                    if (fabs(pj) > M_PI) ((float *)p)[j] = NAN;
+                }
             }
-          }
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
 
             memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL)))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg(kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
         FPU_mode_type oldMode;
         RoundingMode oldRoundMode = kRoundToNearestEven;
-        if( isFract )
+        if (isFract)
         {
-            //Calculate the correctly rounded reference result
-            memset( &oldMode, 0, sizeof( oldMode ) );
-            if( ftz )
-                ForceFTZ( &oldMode );
+            // Calculate the correctly rounded reference result
+            memset(&oldMode, 0, sizeof(oldMode));
+            if (ftz) ForceFTZ(&oldMode);
 
             // Set the rounding mode to match the device
             if (gIsInRTZMode)
                 oldRoundMode = set_round(kRoundTowardZero, kfloat);
         }
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         float *r = (float *)gOut_Ref;
         float *r2 = (float *)gOut_Ref2;
         float *s = (float *)gIn;
 
-        if( skipNanInf )
+        if (skipNanInf)
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
+            for (j = 0; j < bufferSize / sizeof(float); j++)
             {
                 double dd;
                 feclearexcept(FE_OVERFLOW);
 
                 if (relaxedMode)
-                    r[j] = (float) f->rfunc.f_fpf( s[j], &dd );
+                    r[j] = (float)f->rfunc.f_fpf(s[j], &dd);
                 else
-                    r[j] = (float) f->func.f_fpf( s[j], &dd );
+                    r[j] = (float)f->func.f_fpf(s[j], &dd);
 
-                r2[j] = (float) dd;
-                overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+                r2[j] = (float)dd;
+                overflow[j] =
+                    FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
             }
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
+            for (j = 0; j < bufferSize / sizeof(float); j++)
             {
                 double dd;
                 if (relaxedMode)
                     r[j] = (float)f->rfunc.f_fpf(s[j], &dd);
                 else
-                  r[j] = (float) f->func.f_fpf( s[j], &dd );
+                    r[j] = (float)f->func.f_fpf(s[j], &dd);
 
-                r2[j] = (float) dd;
+                r2[j] = (float)dd;
             }
         }
 
-        if( isFract && ftz )
-            RestoreFPState( &oldMode );
+        if (isFract && ftz) RestoreFPState(&oldMode);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray2 failed %d\n", error );
+                vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
+        if (gSkipCorrectnessTesting)
         {
-            if (isFract && gIsInRTZMode)
-                (void)set_round(oldRoundMode, kfloat);
+            if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
             break;
         }
 
-        //Verify data
+        // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
         uint32_t *t2 = (uint32_t *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)gOut[k];
                 uint32_t *q2 = (uint32_t *)gOut2[k];
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] || t2[j] != q2[j]  )
+                if (t[j] != q[j] || t2[j] != q2[j])
                 {
                     double correct, correct2;
                     float err, err2;
-                    float test = ((float*) q)[j];
-                    float test2 = ((float*) q2)[j];
+                    float test = ((float *)q)[j];
+                    float test2 = ((float *)q2)[j];
 
                     if (relaxedMode)
                         correct = f->rfunc.f_fpf(s[j], &correct2);
                     else
-                      correct = f->func.f_fpf( s[j], &correct2 );
+                        correct = f->func.f_fpf(s[j], &correct2);
 
-                    // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
+                    // Per section 10 paragraph 6, accept any result if an input
+                    // or output is a infinity or NaN or overflow
                     if (relaxedMode || skipNanInf)
                     {
-                        if (skipNanInf && overflow[j])
-                            continue;
+                        if (skipNanInf && overflow[j]) continue;
 
-                        // Note: no double rounding here.  Reference functions calculate in single precision.
-                        if( IsFloatInfinity(correct) || IsFloatNaN(correct)     ||
-                            IsFloatInfinity(correct2)|| IsFloatNaN(correct2)    ||
-                            IsFloatInfinity(s[j])    || IsFloatNaN(s[j])        )
+                        // Note: no double rounding here.  Reference functions
+                        // calculate in single precision.
+                        if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                            || IsFloatInfinity(correct2) || IsFloatNaN(correct2)
+                            || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
                             continue;
                     }
 
-                    typedef int (*CheckForSubnormal) (double,float); // If we are in fast relaxed math, we have a different calculation for the subnormal threshold.
+                    typedef int (*CheckForSubnormal)(
+                        double, float); // If we are in fast relaxed math, we
+                                        // have a different calculation for the
+                                        // subnormal threshold.
                     CheckForSubnormal isFloatResultSubnormalPtr;
                     if (relaxedMode)
                     {
-                      err = Abs_Error( test, correct);
-                      err2 = Abs_Error( test2, correct2);
-                      isFloatResultSubnormalPtr = &IsFloatResultSubnormalAbsError;
+                        err = Abs_Error(test, correct);
+                        err2 = Abs_Error(test2, correct2);
+                        isFloatResultSubnormalPtr =
+                            &IsFloatResultSubnormalAbsError;
                     }
                     else
                     {
-                        err = Ulp_Error( test, correct );
-                        err2 = Ulp_Error( test2, correct2 );
+                        err = Ulp_Error(test, correct);
+                        err2 = Ulp_Error(test2, correct2);
                         isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
                     }
-                    int fail = ! (fabsf(err) <= float_ulps && fabsf(err2) <= float_ulps);
+                    int fail = !(fabsf(err) <= float_ulps
+                                 && fabsf(err2) <= float_ulps);
 
-                    if( ftz )
+                    if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if( (*isFloatResultSubnormalPtr)(correct, float_ulps) )
+                        if ((*isFloatResultSubnormalPtr)(correct, float_ulps))
                         {
-                            if( (*isFloatResultSubnormalPtr) (correct2, float_ulps ))
+                            if ((*isFloatResultSubnormalPtr)(correct2,
+                                                             float_ulps))
                             {
-                                fail = fail && ! ( test == 0.0f && test2 == 0.0f );
-                                if( ! fail )
+                                fail = fail && !(test == 0.0f && test2 == 0.0f);
+                                if (!fail)
                                 {
                                     err = 0.0f;
                                     err2 = 0.0f;
@@ -438,209 +514,251 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
                             }
                             else
                             {
-                                fail = fail && ! ( test == 0.0f && fabsf(err2) <= float_ulps);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && fabsf(err2) <= float_ulps);
+                                if (!fail) err = 0.0f;
                             }
                         }
-                        else if( (*isFloatResultSubnormalPtr)(correct2, float_ulps ) )
+                        else if ((*isFloatResultSubnormalPtr)(correct2,
+                                                              float_ulps))
                         {
-                            fail = fail && ! ( test2 == 0.0f && fabsf(err) <= float_ulps);
-                            if( ! fail )
-                                err2 = 0.0f;
+                            fail = fail
+                                && !(test2 == 0.0f && fabsf(err) <= float_ulps);
+                            if (!fail) err2 = 0.0f;
                         }
 
 
                         // retry per section 6.5.3.3
-                        if( IsFloatSubnormal( s[j] ) )
+                        if (IsFloatSubnormal(s[j]))
                         {
                             double correctp, correctn;
                             double correct2p, correct2n;
                             float errp, err2p, errn, err2n;
 
-                            if( skipNanInf )
-                                feclearexcept(FE_OVERFLOW);
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
                             if (relaxedMode)
                             {
-                              correctp = f->rfunc.f_fpf( 0.0, &correct2p );
-                              correctn = f->rfunc.f_fpf( -0.0, &correct2n );
+                                correctp = f->rfunc.f_fpf(0.0, &correct2p);
+                                correctn = f->rfunc.f_fpf(-0.0, &correct2n);
                             }
                             else
                             {
-                              correctp = f->func.f_fpf( 0.0, &correct2p );
-                              correctn = f->func.f_fpf( -0.0, &correct2n );
+                                correctp = f->func.f_fpf(0.0, &correct2p);
+                                correctn = f->func.f_fpf(-0.0, &correct2n);
                             }
 
-                            // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                            if( skipNanInf )
+                            // Per section 10 paragraph 6, accept any result if
+                            // an input or output is a infinity or NaN or
+                            // overflow
+                            if (skipNanInf)
                             {
-                                if( fetestexcept(FE_OVERFLOW) )
-                                    continue;
-
-                                // Note: no double rounding here.  Reference functions calculate in single precision.
-                                if( IsFloatInfinity(correctp) || IsFloatNaN(correctp)   ||
-                                    IsFloatInfinity(correctn) || IsFloatNaN(correctn)   ||
-                                    IsFloatInfinity(correct2p) || IsFloatNaN(correct2p) ||
-                                    IsFloatInfinity(correct2n) || IsFloatNaN(correct2n) )
+                                if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correctp)
+                                    || IsFloatNaN(correctp)
+                                    || IsFloatInfinity(correctn)
+                                    || IsFloatNaN(correctn)
+                                    || IsFloatInfinity(correct2p)
+                                    || IsFloatNaN(correct2p)
+                                    || IsFloatInfinity(correct2n)
+                                    || IsFloatNaN(correct2n))
                                     continue;
                             }
 
                             if (relaxedMode)
                             {
-                              errp = Abs_Error( test, correctp  );
-                              err2p = Abs_Error( test, correct2p  );
-                              errn = Abs_Error( test, correctn  );
-                              err2n = Abs_Error( test, correct2n  );
+                                errp = Abs_Error(test, correctp);
+                                err2p = Abs_Error(test, correct2p);
+                                errn = Abs_Error(test, correctn);
+                                err2n = Abs_Error(test, correct2n);
                             }
                             else
                             {
-                              errp = Ulp_Error( test, correctp  );
-                              err2p = Ulp_Error( test, correct2p  );
-                              errn = Ulp_Error( test, correctn  );
-                              err2n = Ulp_Error( test, correct2n  );
+                                errp = Ulp_Error(test, correctp);
+                                err2p = Ulp_Error(test, correct2p);
+                                errn = Ulp_Error(test, correctn);
+                                err2n = Ulp_Error(test, correct2n);
                             }
 
-                            fail =  fail && ((!(fabsf(errp) <= float_ulps)) && (!(fabsf(err2p) <= float_ulps))    &&
-                                            ((!(fabsf(errn) <= float_ulps)) && (!(fabsf(err2n) <= float_ulps))) );
-                            if( fabsf( errp ) < fabsf(err ) )
-                                err = errp;
-                            if( fabsf( errn ) < fabsf(err ) )
-                                err = errn;
-                            if( fabsf( err2p ) < fabsf(err2 ) )
-                                err2 = err2p;
-                            if( fabsf( err2n ) < fabsf(err2 ) )
-                                err2 = err2n;
+                            fail = fail
+                                && ((!(fabsf(errp) <= float_ulps))
+                                    && (!(fabsf(err2p) <= float_ulps))
+                                    && ((!(fabsf(errn) <= float_ulps))
+                                        && (!(fabsf(err2n) <= float_ulps))));
+                            if (fabsf(errp) < fabsf(err)) err = errp;
+                            if (fabsf(errn) < fabsf(err)) err = errn;
+                            if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
+                            if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
 
                             // retry per section 6.5.3.4
-                            if(  (*isFloatResultSubnormalPtr)( correctp, float_ulps ) || (*isFloatResultSubnormalPtr)( correctn, float_ulps )  )
+                            if ((*isFloatResultSubnormalPtr)(correctp,
+                                                             float_ulps)
+                                || (*isFloatResultSubnormalPtr)(correctn,
+                                                                float_ulps))
                             {
-                              if( (*isFloatResultSubnormalPtr)( correct2p, float_ulps ) || (*isFloatResultSubnormalPtr)( correct2n, float_ulps ) )
-                              {
-                                fail = fail && !( test == 0.0f && test2 == 0.0f);
-                                if( ! fail )
-                                  err = err2 = 0.0f;
-                              }
-                              else
-                              {
-                                fail = fail && ! (test == 0.0f && fabsf(err2) <= float_ulps);
-                                if( ! fail )
-                                  err = 0.0f;
-                              }
+                                if ((*isFloatResultSubnormalPtr)(correct2p,
+                                                                 float_ulps)
+                                    || (*isFloatResultSubnormalPtr)(correct2n,
+                                                                    float_ulps))
+                                {
+                                    fail = fail
+                                        && !(test == 0.0f && test2 == 0.0f);
+                                    if (!fail) err = err2 = 0.0f;
+                                }
+                                else
+                                {
+                                    fail = fail
+                                        && !(test == 0.0f
+                                             && fabsf(err2) <= float_ulps);
+                                    if (!fail) err = 0.0f;
+                                }
                             }
-                            else if( (*isFloatResultSubnormalPtr)( correct2p, float_ulps ) || (*isFloatResultSubnormalPtr)( correct2n, float_ulps ) )
+                            else if ((*isFloatResultSubnormalPtr)(correct2p,
+                                                                  float_ulps)
+                                     || (*isFloatResultSubnormalPtr)(
+                                         correct2n, float_ulps))
                             {
-                                fail = fail && ! (test2 == 0.0f && (fabsf(err) <= float_ulps));
-                                if( ! fail )
-                                    err2 = 0.0f;
+                                fail = fail
+                                    && !(test2 == 0.0f
+                                         && (fabsf(err) <= float_ulps));
+                                if (!fail) err2 = 0.0f;
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError0 )
+                    if (fabsf(err) > maxError0)
                     {
                         maxError0 = fabsf(err);
                         maxErrorVal0 = s[j];
                     }
-                    if( fabsf(err2 ) > maxError1 )
+                    if (fabsf(err2) > maxError1)
                     {
                         maxError1 = fabsf(err2);
                         maxErrorVal1 = s[j];
                     }
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %s%s: {%f, %f} ulp error at %a: *{%a, %a} vs. {%a, %a}\n", f->name, sizeNames[k], err, err2, ((float*) gIn)[j], ((float*) gOut_Ref)[j], ((float*) gOut_Ref2)[j], test, test2 );
-                      error = -1;
-                      goto exit;
+                        vlog_error("\nERROR: %s%s: {%f, %f} ulp error at %a: "
+                                   "*{%a, %a} vs. {%a, %a}\n",
+                                   f->name, sizeNames[k], err, err2,
+                                   ((float *)gIn)[j], ((float *)gOut_Ref)[j],
+                                   ((float *)gOut_Ref2)[j], test, test2);
+                        error = -1;
+                        goto exit;
                     }
                 }
             }
         }
 
-        if (isFract && gIsInRTZMode)
-            (void)set_round(oldRoundMode, kfloat);
+        if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog(".");
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
             p[j] = genrand_int32(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j]) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
 
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, maxErrorVal1 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
+             maxErrorVal1);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -654,16 +772,16 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError0 = 0.0f;
     float maxError1 = 0.0f;
     int ftz = f->ftz || gForceFTZ;
     double maxErrorVal0 = 0.0f;
     double maxErrorVal1 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
-    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( cl_double )) + 1);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -672,135 +790,163 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
-                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
     {
         return error;
     }
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
+       i, programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
-        if( gWimpyMode )
+        if (gWimpyMode)
         {
-            for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
-                p[j] = DoubleFromUInt32((uint32_t) i + j * scale);
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
-                p[j] = DoubleFromUInt32((uint32_t) i + j);
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
 
             memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL)))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg(kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         double *r = (double *)gOut_Ref;
         double *r2 = (double *)gOut_Ref2;
         double *s = (double *)gIn;
-        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
         {
             long double dd;
-            r[j] = (double) f->dfunc.f_fpf( s[j], &dd );
-            r2[j] = (double) dd;
+            r[j] = (double)f->dfunc.f_fpf(s[j], &dd);
+            r2[j] = (double)dd;
         }
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray2 failed %d\n", error );
+                vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
         uint64_t *t2 = (uint64_t *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint64_t *q = (uint64_t *)(gOut[k]);
                 uint64_t *q2 = (uint64_t *)(gOut2[k]);
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] || t2[j] != q2[j]  )
+                if (t[j] != q[j] || t2[j] != q2[j])
                 {
-                    double test = ((double*) q)[j];
-                    double test2 = ((double*) q2)[j];
+                    double test = ((double *)q)[j];
+                    double test2 = ((double *)q2)[j];
                     long double correct2;
-                    long double correct = f->dfunc.f_fpf( s[j], &correct2 );
-                    float err = Bruteforce_Ulp_Error_Double( test, correct );
-                    float err2 = Bruteforce_Ulp_Error_Double( test2, correct2 );
-                    int fail = ! (fabsf(err) <= f->double_ulps && fabsf(err2) <= f->double_ulps);
-                    if( ftz )
+                    long double correct = f->dfunc.f_fpf(s[j], &correct2);
+                    float err = Bruteforce_Ulp_Error_Double(test, correct);
+                    float err2 = Bruteforce_Ulp_Error_Double(test2, correct2);
+                    int fail = !(fabsf(err) <= f->double_ulps
+                                 && fabsf(err2) <= f->double_ulps);
+                    if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if( IsDoubleResultSubnormal(correct, f->double_ulps ) )
+                        if (IsDoubleResultSubnormal(correct, f->double_ulps))
                         {
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) )
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps))
                             {
-                                fail = fail && ! ( test == 0.0f && test2 == 0.0f );
-                                if( ! fail )
+                                fail = fail && !(test == 0.0f && test2 == 0.0f);
+                                if (!fail)
                                 {
                                     err = 0.0f;
                                     err2 = 0.0f;
@@ -808,168 +954,214 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
                             }
                             else
                             {
-                                fail = fail && ! ( test == 0.0f && fabsf(err2) <= f->double_ulps);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && fabsf(err2) <= f->double_ulps);
+                                if (!fail) err = 0.0f;
                             }
                         }
-                        else if( IsDoubleResultSubnormal( correct2, f->double_ulps ) )
+                        else if (IsDoubleResultSubnormal(correct2,
+                                                         f->double_ulps))
                         {
-                            fail = fail && ! ( test2 == 0.0f && fabsf(err) <= f->double_ulps);
-                            if( ! fail )
-                                err2 = 0.0f;
+                            fail = fail
+                                && !(test2 == 0.0f
+                                     && fabsf(err) <= f->double_ulps);
+                            if (!fail) err2 = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( IsDoubleSubnormal( s[j] ) )
+                        if (IsDoubleSubnormal(s[j]))
                         {
                             long double correct2p, correct2n;
-                            long double correctp = f->dfunc.f_fpf( 0.0, &correct2p );
-                            long double correctn = f->dfunc.f_fpf( -0.0, &correct2n );
-                            float errp = Bruteforce_Ulp_Error_Double( test, correctp  );
-                            float err2p = Bruteforce_Ulp_Error_Double( test, correct2p  );
-                            float errn = Bruteforce_Ulp_Error_Double( test, correctn  );
-                            float err2n = Bruteforce_Ulp_Error_Double( test, correct2n  );
-                            fail =  fail && ((!(fabsf(errp) <= f->double_ulps)) && (!(fabsf(err2p) <= f->double_ulps))    &&
-                                            ((!(fabsf(errn) <= f->double_ulps)) && (!(fabsf(err2n) <= f->double_ulps))) );
-                            if( fabsf( errp ) < fabsf(err ) )
-                                err = errp;
-                            if( fabsf( errn ) < fabsf(err ) )
-                                err = errn;
-                            if( fabsf( err2p ) < fabsf(err2 ) )
-                                err2 = err2p;
-                            if( fabsf( err2n ) < fabsf(err2 ) )
-                                err2 = err2n;
+                            long double correctp =
+                                f->dfunc.f_fpf(0.0, &correct2p);
+                            long double correctn =
+                                f->dfunc.f_fpf(-0.0, &correct2n);
+                            float errp =
+                                Bruteforce_Ulp_Error_Double(test, correctp);
+                            float err2p =
+                                Bruteforce_Ulp_Error_Double(test, correct2p);
+                            float errn =
+                                Bruteforce_Ulp_Error_Double(test, correctn);
+                            float err2n =
+                                Bruteforce_Ulp_Error_Double(test, correct2n);
+                            fail = fail
+                                && ((!(fabsf(errp) <= f->double_ulps))
+                                    && (!(fabsf(err2p) <= f->double_ulps))
+                                    && ((!(fabsf(errn) <= f->double_ulps))
+                                        && (!(fabsf(err2n)
+                                              <= f->double_ulps))));
+                            if (fabsf(errp) < fabsf(err)) err = errp;
+                            if (fabsf(errn) < fabsf(err)) err = errn;
+                            if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
+                            if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correctp, f->double_ulps ) || IsDoubleResultSubnormal( correctn, f->double_ulps ) )
+                            if (IsDoubleResultSubnormal(correctp,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correctn,
+                                                           f->double_ulps))
                             {
-                                if( IsDoubleResultSubnormal( correct2p, f->double_ulps ) || IsDoubleResultSubnormal( correct2n, f->double_ulps ) )
+                                if (IsDoubleResultSubnormal(correct2p,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct2n,
+                                                               f->double_ulps))
                                 {
-                                    fail = fail && !( test == 0.0f && test2 == 0.0f);
-                                    if( ! fail )
-                                        err = err2 = 0.0f;
+                                    fail = fail
+                                        && !(test == 0.0f && test2 == 0.0f);
+                                    if (!fail) err = err2 = 0.0f;
                                 }
                                 else
                                 {
-                                    fail = fail && ! (test == 0.0f && fabsf(err2) <= f->double_ulps);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail
+                                        && !(test == 0.0f
+                                             && fabsf(err2) <= f->double_ulps);
+                                    if (!fail) err = 0.0f;
                                 }
                             }
-                            else if( IsDoubleResultSubnormal( correct2p, f->double_ulps ) || IsDoubleResultSubnormal( correct2n, f->double_ulps ) )
+                            else if (IsDoubleResultSubnormal(correct2p,
+                                                             f->double_ulps)
+                                     || IsDoubleResultSubnormal(correct2n,
+                                                                f->double_ulps))
                             {
-                                fail = fail && ! (test2 == 0.0f && (fabsf(err) <= f->double_ulps));
-                                if( ! fail )
-                                    err2 = 0.0f;
+                                fail = fail
+                                    && !(test2 == 0.0f
+                                         && (fabsf(err) <= f->double_ulps));
+                                if (!fail) err2 = 0.0f;
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError0 )
+                    if (fabsf(err) > maxError0)
                     {
                         maxError0 = fabsf(err);
                         maxErrorVal0 = s[j];
                     }
-                    if( fabsf(err2 ) > maxError1 )
+                    if (fabsf(err2) > maxError1)
                     {
                         maxError1 = fabsf(err2);
                         maxErrorVal1 = s[j];
                     }
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: *{%.13la, %.13la} vs. {%.13la, %.13la}\n", f->name, sizeNames[k], err, err2, ((double*) gIn)[j], ((double*) gOut_Ref)[j], ((double*) gOut_Ref2)[j], test, test2 );
-                      error = -1;
-                      goto exit;
+                        vlog_error(
+                            "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: "
+                            "*{%.13la, %.13la} vs. {%.13la, %.13la}\n",
+                            f->name, sizeNames[k], err, err2,
+                            ((double *)gIn)[j], ((double *)gOut_Ref)[j],
+                            ((double *)gOut_Ref2)[j], test, test2);
+                        error = -1;
+                        goto exit;
                     }
                 }
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
-        double *p = (double*) gIn;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-            p[j] = DoubleFromUInt32(genrand_int32(d) );
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        // Init input array
+        double *p = (double *)gIn;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j]) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
 
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, maxErrorVal1 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
+             maxErrorVal1);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -977,6 +1169,3 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
 
     return error;
 }
-
-
-
diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i.cpp
index 153268825d..c71de0ed39 100644
--- a/test_conformance/math_brute_force/unary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -34,63 +34,82 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i], out2 + i );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global int* out2, __global float* in)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       int3 iout = INT_MIN;\n"
-                            "       f0 = ", name, "( f0, &iout );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "       vstore3( iout, 0, out2 + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       int3 iout = INT_MIN;\n"
-                            "       float3 f0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( f0, &iout );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               out2[3*i+1] = iout.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               out2[3*i] = iout.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global int",
+                        sizeNames[vectorSize],
+                        "* out2, __global float",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i], out2 + i );\n"
+                        "}\n" };
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global int* out2, __global float* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 iout = INT_MIN;\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( iout, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       int3 iout = INT_MIN;\n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = iout.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = iout.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
@@ -99,97 +118,120 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i], out2 + i );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global int* out2, __global double* in)\n"
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global int",
+                        sizeNames[vectorSize],
+                        "* out2, __global double",
+                        sizeNames[vectorSize],
+                        "* in)\n"
                         "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   if( i + 1 < get_global_size(0) )\n"
-                        "   {\n"
-                        "       double3 f0 = vload3( 0, in + 3 * i );\n"
-                        "       int3 iout = INT_MIN;\n"
-                        "       f0 = ", name, "( f0, &iout );\n"
-                        "       vstore3( f0, 0, out + 3*i );\n"
-                        "       vstore3( iout, 0, out2 + 3*i );\n"
-                        "   }\n"
-                        "   else\n"
-                        "   {\n"
-                        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                        "       int3 iout = INT_MIN;\n"
-                        "       double3 f0;\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 1:\n"
-                        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
-                        "               break;\n"
-                        "           case 0:\n"
-                        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                        "               break;\n"
-                        "       }\n"
-                        "       f0 = ", name, "( f0, &iout );\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 0:\n"
-                        "               out[3*i+1] = f0.y; \n"
-                        "               out2[3*i+1] = iout.y; \n"
-                        "               // fall through\n"
-                        "           case 1:\n"
-                        "               out[3*i] = f0.x; \n"
-                        "               out2[3*i] = iout.x; \n"
-                        "               break;\n"
-                        "       }\n"
-                        "   }\n"
-                        "}\n"
-                    };
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i], out2 + i );\n"
+                        "}\n" };
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global int* out2, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 iout = INT_MIN;\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( iout, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       int3 iout = INT_MIN;\n"
+        "       double3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = iout.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = iout.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_kernel   *kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernels + i,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
                              info->programs + i, info->relaxedMode);
 }
 
-cl_ulong  abs_cl_long( cl_long i );
-cl_ulong  abs_cl_long( cl_long i )
+cl_ulong abs_cl_long(cl_long i);
+cl_ulong abs_cl_long(cl_long i)
 {
     cl_long mask = i >> 63;
     return (i ^ mask) - mask;
@@ -200,22 +242,22 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int64_t maxError2 = 0;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     float float_ulps;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);
-    cl_ulong  maxiError;
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
+    cl_ulong maxiError;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
-    if( gIsEmbedded )
+    if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
         float_ulps = f->float_ulps;
@@ -225,147 +267,179 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
         return error;
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
+       programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        if( gWimpyMode )
+        if (gWimpyMode)
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                p[j] = (uint32_t) i + j * scale;
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j * scale;
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                p[j] = (uint32_t) i + j;
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
 
             memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         float *r = (float *)gOut_Ref;
         int *r2 = (int *)gOut_Ref2;
         float *s = (float *)gIn;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
-            r[j] = (float) f->func.f_fpI( s[j], r2+j );
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            r[j] = (float)f->func.f_fpI(s[j], r2 + j);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray2 failed %d\n", error );
+                vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
                 int32_t *q2 = (int32_t *)(gOut2[k]);
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] || t2[j] != q2[j] )
+                if (t[j] != q[j] || t2[j] != q2[j])
                 {
-                    float test = ((float*) q)[j];
+                    float test = ((float *)q)[j];
                     int correct2 = INT_MIN;
-                    double correct = f->func.f_fpI( s[j], &correct2 );
-                    float err = Ulp_Error( test, correct );
-                    cl_long iErr = (int64_t) q2[j] - (int64_t) correct2;
-                    int fail = ! (fabsf(err) <= float_ulps && abs_cl_long( iErr ) <= maxiError );
-                    if( ftz )
+                    double correct = f->func.f_fpI(s[j], &correct2);
+                    float err = Ulp_Error(test, correct);
+                    cl_long iErr = (int64_t)q2[j] - (int64_t)correct2;
+                    int fail = !(fabsf(err) <= float_ulps
+                                 && abs_cl_long(iErr) <= maxiError);
+                    if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if( IsFloatResultSubnormal(correct, float_ulps ) )
+                        if (IsFloatResultSubnormal(correct, float_ulps))
                         {
-                            fail = fail && ! ( test == 0.0f && iErr == 0 );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && !(test == 0.0f && iErr == 0);
+                            if (!fail) err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( IsFloatSubnormal( s[j] ) )
+                        if (IsFloatSubnormal(s[j]))
                         {
                             int correct5, correct6;
-                            double correct3 = f->func.f_fpI( 0.0, &correct5 );
-                            double correct4 = f->func.f_fpI( -0.0, &correct6 );
-                            float err2 = Ulp_Error( test, correct3  );
-                            float err3 = Ulp_Error( test, correct4  );
-                            cl_long iErr2 = (long long) q2[j] - (long long) correct5;
-                            cl_long iErr3 = (long long) q2[j] - (long long) correct6;
+                            double correct3 = f->func.f_fpI(0.0, &correct5);
+                            double correct4 = f->func.f_fpI(-0.0, &correct6);
+                            float err2 = Ulp_Error(test, correct3);
+                            float err3 = Ulp_Error(test, correct4);
+                            cl_long iErr2 =
+                                (long long)q2[j] - (long long)correct5;
+                            cl_long iErr3 =
+                                (long long)q2[j] - (long long)correct6;
 
                             // Did +0 work?
-                            if( fabsf(err2) <= float_ulps && abs_cl_long( iErr2 ) <= maxiError )
+                            if (fabsf(err2) <= float_ulps
+                                && abs_cl_long(iErr2) <= maxiError)
                             {
                                 err = err2;
                                 iErr = iErr2;
                                 fail = 0;
                             }
                             // Did -0 work?
-                            else if(fabsf(err3) <= float_ulps && abs_cl_long( iErr3 ) <= maxiError)
+                            else if (fabsf(err3) <= float_ulps
+                                     && abs_cl_long(iErr3) <= maxiError)
                             {
                                 err = err3;
                                 iErr = iErr3;
@@ -373,10 +447,17 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
                             }
 
                             // retry per section 6.5.3.4
-                            if( fail && (IsFloatResultSubnormal(correct2, float_ulps ) || IsFloatResultSubnormal(correct3, float_ulps )) )
+                            if (fail
+                                && (IsFloatResultSubnormal(correct2, float_ulps)
+                                    || IsFloatResultSubnormal(correct3,
+                                                              float_ulps)))
                             {
-                                fail = fail && ! ( test == 0.0f && (abs_cl_long( iErr2 ) <= maxiError || abs_cl_long( iErr3 ) <= maxiError) );
-                                if( ! fail )
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (abs_cl_long(iErr2) <= maxiError
+                                             || abs_cl_long(iErr3)
+                                                 <= maxiError));
+                                if (!fail)
                                 {
                                     err = 0.0f;
                                     iErr = 0;
@@ -384,20 +465,24 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
                     }
-                    if( llabs(iErr) > maxError2 )
+                    if (llabs(iErr) > maxError2)
                     {
-                        maxError2 = llabs(iErr );
+                        maxError2 = llabs(iErr);
                         maxErrorVal2 = s[j];
                     }
 
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %s%s: {%f, %d} ulp error at %a: *{%a, %d} vs. {%a, %d}\n", f->name, sizeNames[k], err, (int) iErr, ((float*) gIn)[j], ((float*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], test, q2[j] );
+                        vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: "
+                                   "*{%a, %d} vs. {%a, %d}\n",
+                                   f->name, sizeNames[k], err, (int)iErr,
+                                   ((float *)gIn)[j], ((float *)gOut_Ref)[j],
+                                   ((int *)gOut_Ref2)[j], test, q2[j]);
                         error = -1;
                         goto exit;
                     }
@@ -405,88 +490,109 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
             p[j] = genrand_int32(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -500,18 +606,18 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int64_t maxError2 = 0;
     int ftz = f->ftz || gForceFTZ;
     double maxErrorVal = 0.0f;
     double maxErrorVal2 = 0.0f;
-    cl_ulong  maxiError = f->double_ulps == INFINITY ? CL_ULONG_MAX : 0;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    cl_ulong maxiError = f->double_ulps == INFINITY ? CL_ULONG_MAX : 0;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
 
     uint64_t step = getTestStep(sizeof(double), bufferSize);
-    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( double )) + 1);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(double)) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -520,151 +626,185 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
-                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
     {
         return error;
     }
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /* Disabled per-vector-size build loop:
+        for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+            if ((error = BuildKernelDouble(f->nameInCode, (int)i,
+                                           kernels + i, programs + i)))
+                return error; */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
-        if( gWimpyMode )
+        if (gWimpyMode)
         {
-            for( j = 0; j < bufferSize / sizeof( double ); j++ )
-                p[j] = DoubleFromUInt32((uint32_t) i + j * scale);
+            for (j = 0; j < bufferSize / sizeof(double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( double ); j++ )
-                p[j] = DoubleFromUInt32((uint32_t) i + j);
+            for (j = 0; j < bufferSize / sizeof(double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
 
             memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         double *r = (double *)gOut_Ref;
         int *r2 = (int *)gOut_Ref2;
         double *s = (double *)gIn;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-            r[j] = (double) f->dfunc.f_fpI( s[j], r2+j );
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+            r[j] = (double)f->dfunc.f_fpI(s[j], r2 + j);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray2 failed %d\n", error );
+                vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint64_t *q = (uint64_t *)(gOut[k]);
                 int32_t *q2 = (int32_t *)(gOut2[k]);
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] || t2[j] != q2[j] )
+                if (t[j] != q[j] || t2[j] != q2[j])
                 {
-                    double test = ((double*) q)[j];
+                    double test = ((double *)q)[j];
                     int correct2 = INT_MIN;
-                    long double correct = f->dfunc.f_fpI( s[j], &correct2 );
-                    float err = Bruteforce_Ulp_Error_Double( test, correct );
-                    cl_long iErr = (long long) q2[j] - (long long) correct2;
-                    int fail = ! (fabsf(err) <= f->double_ulps && abs_cl_long( iErr ) <= maxiError );
-                    if( ftz )
+                    long double correct = f->dfunc.f_fpI(s[j], &correct2);
+                    float err = Bruteforce_Ulp_Error_Double(test, correct);
+                    cl_long iErr = (long long)q2[j] - (long long)correct2;
+                    int fail = !(fabsf(err) <= f->double_ulps
+                                 && abs_cl_long(iErr) <= maxiError);
+                    if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if( IsDoubleResultSubnormal(correct, f->double_ulps ) )
+                        if (IsDoubleResultSubnormal(correct, f->double_ulps))
                         {
-                            fail = fail && ! ( test == 0.0f && iErr == 0 );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && !(test == 0.0f && iErr == 0);
+                            if (!fail) err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( IsDoubleSubnormal( s[j] ) )
+                        if (IsDoubleSubnormal(s[j]))
                         {
                             int correct5, correct6;
-                            long double correct3 = f->dfunc.f_fpI( 0.0, &correct5 );
-                            long double correct4 = f->dfunc.f_fpI( -0.0, &correct6 );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                            cl_long iErr2 = (long long) q2[j] - (long long) correct5;
-                            cl_long iErr3 = (long long) q2[j] - (long long) correct6;
+                            long double correct3 =
+                                f->dfunc.f_fpI(0.0, &correct5);
+                            long double correct4 =
+                                f->dfunc.f_fpI(-0.0, &correct6);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct4);
+                            cl_long iErr2 =
+                                (long long)q2[j] - (long long)correct5;
+                            cl_long iErr3 =
+                                (long long)q2[j] - (long long)correct6;
 
                             // Did +0 work?
-                            if( fabsf(err2) <= f->double_ulps && abs_cl_long( iErr2 ) <= maxiError )
+                            if (fabsf(err2) <= f->double_ulps
+                                && abs_cl_long(iErr2) <= maxiError)
                             {
                                 err = err2;
                                 iErr = iErr2;
                                 fail = 0;
                             }
                             // Did -0 work?
-                            else if(fabsf(err3) <= f->double_ulps && abs_cl_long( iErr3 ) <= maxiError)
+                            else if (fabsf(err3) <= f->double_ulps
+                                     && abs_cl_long(iErr3) <= maxiError)
                             {
                                 err = err3;
                                 iErr = iErr3;
@@ -672,10 +812,18 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
                             }
 
                             // retry per section 6.5.3.4
-                            if( fail && (IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )) )
+                            if (fail
+                                && (IsDoubleResultSubnormal(correct2,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct3,
+                                                               f->double_ulps)))
                             {
-                                fail = fail && ! ( test == 0.0f && (abs_cl_long( iErr2 ) <= maxiError || abs_cl_long( iErr3 ) <= maxiError) );
-                                if( ! fail )
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (abs_cl_long(iErr2) <= maxiError
+                                             || abs_cl_long(iErr3)
+                                                 <= maxiError));
+                                if (!fail)
                                 {
                                     err = 0.0f;
                                     iErr = 0;
@@ -683,20 +831,24 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
                     }
-                    if( llabs(iErr) > maxError2 )
+                    if (llabs(iErr) > maxError2)
                     {
-                        maxError2 = llabs(iErr );
+                        maxError2 = llabs(iErr);
                         maxErrorVal2 = s[j];
                     }
 
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %sD%s: {%f, %d} ulp error at %.13la: *{%.13la, %d} vs. {%.13la, %d}\n", f->name, sizeNames[k], err, (int) iErr, ((double*) gIn)[j], ((double*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], test, q2[j] );
+                        vlog_error("\nERROR: %sD%s: {%f, %d} ulp error at "
+                                   "%.13la: *{%.13la, %d} vs. {%.13la, %d}\n",
+                                   f->name, sizeNames[k], err, (int)iErr,
+                                   ((double *)gIn)[j], ((double *)gOut_Ref)[j],
+                                   ((int *)gOut_Ref2)[j], test, q2[j]);
                         error = -1;
                         goto exit;
                     }
@@ -704,91 +856,111 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
 
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
             p[j] = DoubleFromUInt32(genrand_int32(d));
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sd%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sd%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -796,6 +968,3 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
 
     return error;
 }
-
-
-
diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u.cpp
index 97fd25f96c..397ff877ee 100644
--- a/test_conformance/math_brute_force/unary_u.cpp
+++ b/test_conformance/math_brute_force/unary_u.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -33,61 +33,77 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global uint", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global uint* in)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       uint3 u0 = vload3( 0, in + 3 * i );\n"
-                            "       float3 f0 = ", name, "( u0 );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       uint3 u0;\n"
-                            "       float3 f0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               u0 = (uint3)( in[3*i], 0xdead, 0xdead ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               u0 = (uint3)( in[3*i], in[3*i+1], 0xdead ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( u0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global uint",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global uint* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       uint3 u0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f0 = ",
+        name,
+        "( u0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       uint3 u0;\n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               u0 = (uint3)( in[3*i], 0xdead, 0xdead ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               u0 = (uint3)( in[3*i], in[3*i+1], 0xdead ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( u0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
@@ -95,90 +111,110 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-                            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global ulong", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global ulong* in)\n"
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global ulong",
+                        sizeNames[vectorSize],
+                        "* in)\n"
                         "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   if( i + 1 < get_global_size(0) )\n"
-                        "   {\n"
-                        "       ulong3 u0 = vload3( 0, in + 3 * i );\n"
-                        "       double3 f0 = ", name, "( u0 );\n"
-                        "       vstore3( f0, 0, out + 3*i );\n"
-                        "   }\n"
-                        "   else\n"
-                        "   {\n"
-                        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                        "       ulong3 u0;\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 1:\n"
-                        "               u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, 0xdeaddeaddeaddeadUL ); \n"
-                        "               break;\n"
-                        "           case 0:\n"
-                        "               u0 = (ulong3)( in[3*i], in[3*i+1], 0xdeaddeaddeaddeadUL ); \n"
-                        "               break;\n"
-                        "       }\n"
-                        "       double3 f0 = ", name, "( u0 );\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 0:\n"
-                        "               out[3*i+1] = f0.y; \n"
-                        "               // fall through\n"
-                        "           case 1:\n"
-                        "               out[3*i] = f0.x; \n"
-                        "               break;\n"
-                        "       }\n"
-                        "   }\n"
-                        "}\n"
-                    };
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                         "__kernel void math_kernel",
+                         sizeNames[vectorSize],
+                         "( __global double* out, __global ulong* in)\n"
+                         "{\n"
+                         "   size_t i = get_global_id(0);\n"
+                         "   if( i + 1 < get_global_size(0) )\n"
+                         "   {\n"
+                         "       ulong3 u0 = vload3( 0, in + 3 * i );\n"
+                         "       double3 f0 = ",
+                         name,
+                         "( u0 );\n"
+                         "       vstore3( f0, 0, out + 3*i );\n"
+                         "   }\n"
+                         "   else\n"
+                         "   {\n"
+                         "       size_t parity = i & 1;   // Figure out how "
+                         "many elements are left over after BUFFER_SIZE % "
+                         "(3*sizeof(float)). Assume power of two buffer size \n"
+                         "       ulong3 u0;\n"
+                         "       switch( parity )\n"
+                         "       {\n"
+                         "           case 1:\n"
+                         "               u0 = (ulong3)( in[3*i], "
+                         "0xdeaddeaddeaddeadUL, 0xdeaddeaddeaddeadUL ); \n"
+                         "               break;\n"
+                         "           case 0:\n"
+                         "               u0 = (ulong3)( in[3*i], in[3*i+1], "
+                         "0xdeaddeaddeaddeadUL ); \n"
+                         "               break;\n"
+                         "       }\n"
+                         "       double3 f0 = ",
+                         name,
+                         "( u0 );\n"
+                         "       switch( parity )\n"
+                         "       {\n"
+                         "           case 0:\n"
+                         "               out[3*i+1] = f0.y; \n"
+                         "               // fall through\n"
+                         "           case 1:\n"
+                         "               out[3*i] = f0.x; \n"
+                         "               break;\n"
+                         "       }\n"
+                         "   }\n"
+                         "}\n" };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_kernel   *kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernels + i,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
                              info->programs + i, info->relaxedMode);
@@ -189,22 +225,22 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
 
     uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( double )) + 1);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(double)) + 1);
     int isRangeLimited = 0;
     float float_ulps;
     float half_sin_cos_tan_limit = 0;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
-    if( gIsEmbedded)
+    if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
         float_ulps = f->float_ulps;
@@ -212,240 +248,282 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
         return error;
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
+       programs + i) ) ) return error;
+    */
 
-    if( 0 == strcmp( f->name, "half_sin") || 0 == strcmp( f->name, "half_cos") )
+    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
     {
         isRangeLimited = 1;
-        half_sin_cos_tan_limit = 1.0f + float_ulps * (FLT_EPSILON/2.0f);             // out of range results from finite inputs must be in [-1,1]
+        half_sin_cos_tan_limit = 1.0f
+            + float_ulps
+                * (FLT_EPSILON / 2.0f); // out of range results from finite
+                                        // inputs must be in [-1,1]
     }
-    else if( 0 == strcmp( f->name, "half_tan"))
+    else if (0 == strcmp(f->name, "half_tan"))
     {
         isRangeLimited = 1;
-        half_sin_cos_tan_limit = INFINITY;             // out of range resut from finite inputs must be numeric
+        half_sin_cos_tan_limit =
+            INFINITY; // out of range resut from finite inputs must be numeric
     }
 
 
-    for( i = 0; i < (1ULL<<32); i += step  )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        if( gWimpyMode )
+        if (gWimpyMode)
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                p[j] = (uint32_t) i + j * scale;
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j * scale;
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                p[j] = (uint32_t) i + j;
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL)))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ))){ LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)))
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
             {
-                vlog_error( "FAILURE -- could not execute kernel\n" );
+                vlog_error("FAILURE -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
-        float *r = (float*) gOut_Ref;
-        cl_uint *s = (cl_uint*) gIn;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
-            r[j] = (float) f->func.f_u( s[j] );
+        // Calculate the correctly rounded reference result
+        float *r = (float *)gOut_Ref;
+        cl_uint *s = (cl_uint *)gIn;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            r[j] = (float)f->func.f_u(s[j]);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
 
-        //Verify data
-        uint32_t *t = (uint32_t*) gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        // Verify data
+        uint32_t *t = (uint32_t *)gOut_Ref;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
-                uint32_t *q = (uint32_t*)(gOut[k]);
+                uint32_t *q = (uint32_t *)(gOut[k]);
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+                if (t[j] != q[j])
                 {
-                    float test = ((float*) q)[j];
-                    double correct = f->func.f_u( s[j] );
-                    float err = Ulp_Error( test, correct );
-                    int fail = ! (fabsf(err) <= float_ulps);
+                    float test = ((float *)q)[j];
+                    double correct = f->func.f_u(s[j]);
+                    float err = Ulp_Error(test, correct);
+                    int fail = !(fabsf(err) <= float_ulps);
 
                     // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
-                    if( isRangeLimited && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) && fabsf(s[j]) < INFINITY )
+                    if (isRangeLimited
+                        && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16)
+                        && fabsf(s[j]) < INFINITY)
                     {
-                        if( fabsf( test ) <= half_sin_cos_tan_limit )
+                        if (fabsf(test) <= half_sin_cos_tan_limit)
                         {
                             err = 0;
                             fail = 0;
                         }
                     }
 
-                     if( fail )
+                    if (fail)
                     {
-                        if( ftz )
+                        if (ftz)
                         {
                             // retry per section 6.5.3.2
-                            if( IsFloatResultSubnormal(correct, float_ulps) )
+                            if (IsFloatResultSubnormal(correct, float_ulps))
                             {
-                                fail = fail && ( test != 0.0f );
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
                     }
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\n%s%s: %f ulp error at 0x%8.8x: *%a vs. %a\n", f->name, sizeNames[k], err, ((uint32_t*) gIn)[j], ((float*) gOut_Ref)[j], test );
-                      error = -1;
+                        vlog_error(
+                            "\n%s%s: %f ulp error at 0x%8.8x: *%a vs. %a\n",
+                            f->name, sizeNames[k], err, ((uint32_t *)gIn)[j],
+                            ((float *)gOut_Ref)[j], test);
+                        error = -1;
                         goto exit;
                     }
                 }
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
-        uint32_t *p = (uint32_t*)gIn;
-        if( strstr( f->name, "exp" ) || strstr( f->name, "sin" ) || strstr( f->name, "cos" ) || strstr( f->name, "tan" ) )
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                ((float*)p)[j] = (float) genrand_real1(d);
-        else if( strstr( f->name, "log" ) )
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        // Init input array
+        uint32_t *p = (uint32_t *)gIn;
+        if (strstr(f->name, "exp") || strstr(f->name, "sin")
+            || strstr(f->name, "cos") || strstr(f->name, "tan"))
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                ((float *)p)[j] = (float)genrand_real1(d);
+        else if (strstr(f->name, "log"))
+            for (j = 0; j < bufferSize / sizeof(float); j++)
                 p[j] = genrand_int32(d) & 0x7fffffff;
         else
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
+            for (j = 0; j < bufferSize / sizeof(float); j++)
                 p[j] = genrand_int32(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILURE -- could not execute kernel\n" );
+                    vlog_error("FAILURE -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -454,9 +532,9 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
     return error;
 }
 
-static cl_ulong random64( MTdata d )
+static cl_ulong random64(MTdata d)
 {
-    return (cl_ulong) genrand_int32(d) | ((cl_ulong) genrand_int32(d) << 32);
+    return (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32);
 }
 
 int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
@@ -464,12 +542,12 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ;
     double maxErrorVal = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
@@ -479,211 +557,243 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
-                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
     {
         return error;
     }
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
+       i, programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step  )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         cl_ulong *p = (cl_ulong *)gIn;
-        for( j = 0; j < bufferSize / sizeof( cl_ulong ); j++ )
-            p[j] = random64(d);
+        for (j = 0; j < bufferSize / sizeof(cl_ulong); j++) p[j] = random64(d);
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL)))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ))){ LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)))
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
             {
-                vlog_error( "FAILURE -- could not execute kernel\n" );
+                vlog_error("FAILURE -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
-        double *r = (double*) gOut_Ref;
-        cl_ulong *s = (cl_ulong*) gIn;
-        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
-            r[j] = (double) f->dfunc.f_u( s[j] );
+        // Calculate the correctly rounded reference result
+        double *r = (double *)gOut_Ref;
+        cl_ulong *s = (cl_ulong *)gIn;
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            r[j] = (double)f->dfunc.f_u(s[j]);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
 
-        //Verify data
-        uint64_t *t = (uint64_t*) gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
+        // Verify data
+        uint64_t *t = (uint64_t *)gOut_Ref;
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
-                uint64_t *q = (uint64_t*)(gOut[k]);
+                uint64_t *q = (uint64_t *)(gOut[k]);
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+                if (t[j] != q[j])
                 {
-                    double test = ((double*) q)[j];
-                    long double correct = f->dfunc.f_u( s[j] );
+                    double test = ((double *)q)[j];
+                    long double correct = f->dfunc.f_u(s[j]);
                     float err = Bruteforce_Ulp_Error_Double(test, correct);
-                    int fail = ! (fabsf(err) <= f->double_ulps);
+                    int fail = !(fabsf(err) <= f->double_ulps);
 
                     // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
-                    if( fail )
+                    if (fail)
                     {
-                        if( ftz )
+                        if (ftz)
                         {
                             // retry per section 6.5.3.2
-                            if( IsDoubleResultSubnormal(correct, f->double_ulps) )
+                            if (IsDoubleResultSubnormal(correct,
+                                                        f->double_ulps))
                             {
-                                fail = fail && ( test != 0.0 );
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
                     }
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\n%s%sD: %f ulp error at 0x%16.16llx: *%.13la vs. %.13la\n", f->name, sizeNames[k], err, ((uint64_t*) gIn)[j], ((double*) gOut_Ref)[j], test );
-                      error = -1;
+                        vlog_error("\n%s%sD: %f ulp error at 0x%16.16llx: "
+                                   "*%.13la vs. %.13la\n",
+                                   f->name, sizeNames[k], err,
+                                   ((uint64_t *)gIn)[j],
+                                   ((double *)gOut_Ref)[j], test);
+                        error = -1;
                         goto exit;
                     }
                 }
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
-        double *p = (double*) gIn;
+        // Init input array
+        double *p = (double *)gIn;
 
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-            p[j] = random64(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        for (j = 0; j < bufferSize / sizeof(double); j++) p[j] = random64(d);
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILURE -- could not execute kernel\n" );
+                    vlog_error("FAILURE -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -691,4 +801,3 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
 
     return error;
 }
-

From 0b6fbd15d1f6427d85215945ca58e904c543066d Mon Sep 17 00:00:00 2001
From: AlexBinXie <54727243+AlexXAmd@users.noreply.github.com>
Date: Thu, 14 Jan 2021 08:27:45 -0500
Subject: [PATCH 017/158] Use glFinish to replace glFlush (#1102)

The OpenCL 1.1 specification states:
"Prior to calling clEnqueueAcquireGLObjects, the application must ensure that any pending GL
operations which access the objects specified in mem_objects have completed. This may be
accomplished portably by issuing and waiting for completion of a glFinish command on all GL
contexts with pending references to these objects."

glFlush only submits pending GL commands and does not wait for them to complete, so the
tests now call glFinish instead.

Signed-off-by: Alex Xie <AlexBin.Xie@amd.com>
---
 test_conformance/gl/test_buffers.cpp       | 2 +-
 test_conformance/gl/test_image_methods.cpp | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/test_conformance/gl/test_buffers.cpp b/test_conformance/gl/test_buffers.cpp
index f11590fb91..35f01ee6bb 100644
--- a/test_conformance/gl/test_buffers.cpp
+++ b/test_conformance/gl/test_buffers.cpp
@@ -184,7 +184,7 @@ int test_buffer_kernel(cl_context context, cl_command_queue queue, ExplicitType
     glBufferData( GL_ARRAY_BUFFER, bufferSize, outDataGL, GL_STATIC_DRAW );
 
     glBindBuffer( GL_ARRAY_BUFFER, 0 );
-    glFlush();
+    glFinish();
 
 
     /* Generate some streams. The first and last ones are GL, middle one just vanilla CL */
diff --git a/test_conformance/gl/test_image_methods.cpp b/test_conformance/gl/test_image_methods.cpp
index afaa08fc9a..07f5b65e83 100644
--- a/test_conformance/gl/test_image_methods.cpp
+++ b/test_conformance/gl/test_image_methods.cpp
@@ -287,10 +287,10 @@ int test_image_format_methods( cl_device_id device, cl_context context, cl_comma
     error = clSetKernelArg( kernel, 1, sizeof( outDataBuffer ), &outDataBuffer );
     test_error( error, "Unable to set kernel argument" );
 
-  // Flush and Acquire.
-  glFlush();
-  error = (*clEnqueueAcquireGLObjects_ptr)( queue, 1, &image, 0, NULL, NULL);
-  test_error( error, "Unable to acquire GL obejcts");
+    // Finish and Acquire.
+    glFinish();
+    error = (*clEnqueueAcquireGLObjects_ptr)(queue, 1, &image, 0, NULL, NULL);
+    test_error(error, "Unable to acquire GL obejcts");
 
     size_t threads[1] = { 1 }, localThreads[1] = { 1 };
 

From 03a0989998dfd968e773b160c66eee924ce193ab Mon Sep 17 00:00:00 2001
From: James Price <jrprice@google.com>
Date: Thu, 14 Jan 2021 08:27:59 -0500
Subject: [PATCH 018/158] Use std::vector for format lists in images suite
 (#1105)

* Use std::vector for format lists in images suite

Avoids memory deallocation issues (for example, clCopyImage released its
new[]-allocated filter flags with scalar delete) and generally simplifies
the code.

* Fixup formatting with git-clang-format
---
 test_common/harness/imageHelpers.cpp          |  26 ++--
 test_common/harness/imageHelpers.h            |  22 ++--
 .../images/clCopyImage/test_loops.cpp         |  24 +---
 .../images/clFillImage/test_loops.cpp         |  26 +---
 .../images/clGetInfo/test_loops.cpp           |  24 +---
 .../images/clReadWriteImage/test_loops.cpp    |  63 +++++-----
 test_conformance/images/common.cpp            |  20 +--
 test_conformance/images/common.h              |   9 +-
 .../kernel_image_methods/test_loops.cpp       |  23 +---
 .../kernel_read_write/test_iterations.cpp     |   7 +-
 .../images/kernel_read_write/test_loops.cpp   |  53 +++-----
 .../images/kernel_read_write/test_read_1D.cpp |   7 +-
 .../kernel_read_write/test_read_1D_array.cpp  |   7 +-
 .../kernel_read_write/test_read_2D_array.cpp  |   7 +-
 .../images/kernel_read_write/test_read_3D.cpp |   7 +-
 .../kernel_read_write/test_write_1D.cpp       |   5 +-
 .../kernel_read_write/test_write_1D_array.cpp |   5 +-
 .../kernel_read_write/test_write_2D_array.cpp |   5 +-
 .../kernel_read_write/test_write_3D.cpp       |   5 +-
 .../kernel_read_write/test_write_image.cpp    |  39 ++++--
 .../samplerlessReads/test_iterations.cpp      |   7 +-
 .../images/samplerlessReads/test_loops.cpp    | 114 +++++++++---------
 .../images/samplerlessReads/test_read_1D.cpp  |   7 +-
 .../samplerlessReads/test_read_1D_array.cpp   |   7 +-
 .../samplerlessReads/test_read_1D_buffer.cpp  |   7 +-
 .../samplerlessReads/test_read_2D_array.cpp   |   7 +-
 .../images/samplerlessReads/test_read_3D.cpp  |   7 +-
 test_conformance/images/testBase.h            |  27 +++--
 28 files changed, 279 insertions(+), 288 deletions(-)

diff --git a/test_common/harness/imageHelpers.cpp b/test_common/harness/imageHelpers.cpp
index b785f64ded..db9cf3f6ee 100644
--- a/test_common/harness/imageHelpers.cpp
+++ b/test_common/harness/imageHelpers.cpp
@@ -269,7 +269,7 @@ int is_format_signed(const cl_image_format *format)
     }
 }
 
-uint32_t get_pixel_size(cl_image_format *format)
+uint32_t get_pixel_size(const cl_image_format *format)
 {
     switch (format->image_channel_data_type)
     {
@@ -330,7 +330,7 @@ uint32_t next_power_of_two(uint32_t v)
     return v;
 }
 
-uint32_t get_pixel_alignment(cl_image_format *format)
+uint32_t get_pixel_alignment(const cl_image_format *format)
 {
     return next_power_of_two(get_pixel_size(format));
 }
@@ -533,7 +533,7 @@ bool is_sRGBA_order(cl_channel_order image_channel_order)
 
 // Format helpers
 
-int has_alpha(cl_image_format *format)
+int has_alpha(const cl_image_format *format)
 {
     switch (format->image_channel_order)
     {
@@ -586,7 +586,7 @@ void get_max_sizes(
     size_t maxWidth, size_t maxHeight, size_t maxDepth, size_t maxArraySize,
     const cl_ulong maxIndividualAllocSize, // CL_DEVICE_MAX_MEM_ALLOC_SIZE
     const cl_ulong maxTotalAllocSize, // CL_DEVICE_GLOBAL_MEM_SIZE
-    cl_mem_object_type image_type, cl_image_format *format,
+    cl_mem_object_type image_type, const cl_image_format *format,
     int usingMaxPixelSizeBuffer)
 {
 
@@ -797,7 +797,7 @@ void get_max_sizes(
     }
 }
 
-float get_max_absolute_error(cl_image_format *format,
+float get_max_absolute_error(const cl_image_format *format,
                              image_sampler_data *sampler)
 {
     if (sampler->filter_mode == CL_FILTER_NEAREST) return 0.0f;
@@ -816,7 +816,7 @@ float get_max_absolute_error(cl_image_format *format,
     }
 }
 
-float get_max_relative_error(cl_image_format *format,
+float get_max_relative_error(const cl_image_format *format,
                              image_sampler_data *sampler, int is3D,
                              int isLinearFilter)
 {
@@ -899,7 +899,7 @@ float get_max_relative_error(cl_image_format *format,
     return maxError;
 }
 
-size_t get_format_max_int(cl_image_format *format)
+size_t get_format_max_int(const cl_image_format *format)
 {
     switch (format->image_channel_data_type)
     {
@@ -932,7 +932,7 @@ size_t get_format_max_int(cl_image_format *format)
     }
 }
 
-int get_format_min_int(cl_image_format *format)
+int get_format_min_int(const cl_image_format *format)
 {
     switch (format->image_channel_data_type)
     {
@@ -1247,7 +1247,7 @@ void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x,
         return;
     }
 
-    cl_image_format *format = imageInfo->format;
+    const cl_image_format *format = imageInfo->format;
 
     unsigned int i;
     float tempData[4];
@@ -3571,8 +3571,8 @@ cl_float CoordWalker::Get(size_t idx, size_t el)
 }
 
 
-void print_read_header(cl_image_format *format, image_sampler_data *sampler,
-                       bool err, int t)
+void print_read_header(const cl_image_format *format,
+                       image_sampler_data *sampler, bool err, int t)
 {
     const char *addressMode = NULL;
     const char *normalizedNames[2] = { "UNNORMALIZED", "NORMALIZED" };
@@ -3638,7 +3638,7 @@ void print_read_header(cl_image_format *format, image_sampler_data *sampler,
     }
 }
 
-void print_write_header(cl_image_format *format, bool err = false)
+void print_write_header(const cl_image_format *format, bool err = false)
 {
     if (err)
         log_error("[%-7s %-24s %d]\n",
@@ -3653,7 +3653,7 @@ void print_write_header(cl_image_format *format, bool err = false)
 }
 
 
-void print_header(cl_image_format *format, bool err = false)
+void print_header(const cl_image_format *format, bool err = false)
 {
     if (err)
     {
diff --git a/test_common/harness/imageHelpers.h b/test_common/harness/imageHelpers.h
index 87595094eb..8544fbfa16 100644
--- a/test_common/harness/imageHelpers.h
+++ b/test_common/harness/imageHelpers.h
@@ -76,11 +76,11 @@ int round_to_even(float v);
 #define CONVERT_UINT(v, max, max_val)                                          \
     (v < 0 ? 0 : (v > max ? max_val : round_to_even(v)))
 
-extern void print_read_header(cl_image_format *format,
+extern void print_read_header(const cl_image_format *format,
                               image_sampler_data *sampler, bool err = false,
                               int t = 0);
-extern void print_write_header(cl_image_format *format, bool err);
-extern void print_header(cl_image_format *format, bool err);
+extern void print_write_header(const cl_image_format *format, bool err);
+extern void print_header(const cl_image_format *format, bool err);
 extern bool find_format(cl_image_format *formatList, unsigned int numFormats,
                         cl_image_format *formatToFind);
 extern bool is_image_format_required(cl_image_format format, cl_mem_flags flags,
@@ -98,7 +98,7 @@ extern uint32_t get_channel_order_channel_count(cl_channel_order order);
 cl_channel_type get_channel_type_from_name(const char *name);
 cl_channel_order get_channel_order_from_name(const char *name);
 extern int is_format_signed(const cl_image_format *format);
-extern uint32_t get_pixel_size(cl_image_format *format);
+extern uint32_t get_pixel_size(const cl_image_format *format);
 
 /* Helper to get any ol image format as long as it is 8-bits-per-channel */
 extern int get_8_bit_image_format(cl_context context,
@@ -123,7 +123,7 @@ typedef struct
     size_t rowPitch;
     size_t slicePitch;
     size_t arraySize;
-    cl_image_format *format;
+    const cl_image_format *format;
     cl_mem buffer;
     cl_mem_object_type type;
     cl_uint num_mip_levels;
@@ -139,9 +139,9 @@ void get_max_sizes(size_t *numberOfSizes, const int maxNumberOfSizes,
                    size_t maxDepth, size_t maxArraySize,
                    const cl_ulong maxIndividualAllocSize,
                    const cl_ulong maxTotalAllocSize,
-                   cl_mem_object_type image_type, cl_image_format *format,
+                   cl_mem_object_type image_type, const cl_image_format *format,
                    int usingMaxPixelSize = 0);
-extern size_t get_format_max_int(cl_image_format *format);
+extern size_t get_format_max_int(const cl_image_format *format);
 
 extern cl_ulong get_image_size(image_descriptor const *imageInfo);
 extern cl_ulong get_image_size_mb(image_descriptor const *imageInfo);
@@ -173,7 +173,7 @@ extern void copy_image_data(image_descriptor *srcImageInfo,
                             void *destImageValues, const size_t sourcePos[],
                             const size_t destPos[], const size_t regionSize[]);
 
-int has_alpha(cl_image_format *format);
+int has_alpha(const cl_image_format *format);
 
 extern bool is_sRGBA_order(cl_channel_order image_channel_order);
 
@@ -240,7 +240,7 @@ void read_image_pixel(void *imageData, image_descriptor *imageInfo, int x,
         return;
     }
 
-    cl_image_format *format = imageInfo->format;
+    const cl_image_format *format = imageInfo->format;
 
     unsigned int i;
     T tempData[4];
@@ -662,9 +662,9 @@ extern char *create_random_image_data(ExplicitType dataType,
 
 extern void get_sampler_kernel_code(image_sampler_data *imageSampler,
                                     char *outLine);
-extern float get_max_absolute_error(cl_image_format *format,
+extern float get_max_absolute_error(const cl_image_format *format,
                                     image_sampler_data *sampler);
-extern float get_max_relative_error(cl_image_format *format,
+extern float get_max_relative_error(const cl_image_format *format,
                                     image_sampler_data *sampler, int is3D,
                                     int isLinearFilter);
 
diff --git a/test_conformance/images/clCopyImage/test_loops.cpp b/test_conformance/images/clCopyImage/test_loops.cpp
index 03f34be7bb..6ee1e536ad 100644
--- a/test_conformance/images/clCopyImage/test_loops.cpp
+++ b/test_conformance/images/clCopyImage/test_loops.cpp
@@ -105,25 +105,14 @@ int test_image_type( cl_device_id device, cl_context context, cl_command_queue q
     int ret = 0;
 
     // Grab the list of supported image formats for integer reads
-    cl_image_format *formatList;
-    bool *filterFlags;
-    unsigned int numFormats;
+    std::vector<cl_image_format> formatList;
+    if (get_format_list(context, imageType, formatList, flags)) return -1;
 
-    if( get_format_list( context, imageType, formatList, numFormats, flags ) )
-        return -1;
-
-    filterFlags = new bool[ numFormats ];
-    if( filterFlags == NULL )
-    {
-        log_error( "ERROR: Out of memory allocating filter flags list!\n" );
-        return -1;
-    }
-    memset( filterFlags, 0, sizeof( bool ) * numFormats );
-
-    filter_formats(formatList, filterFlags, numFormats, NULL);
+    std::vector<bool> filterFlags(formatList.size(), false);
+    filter_formats(formatList, filterFlags, nullptr);
 
     // Run the format list
-    for( unsigned int i = 0; i < numFormats; i++ )
+    for (unsigned int i = 0; i < formatList.size(); i++)
     {
         int test_return = 0;
         if( filterFlags[i] )
@@ -168,9 +157,6 @@ int test_image_type( cl_device_id device, cl_context context, cl_command_queue q
         ret += test_return;
     }
 
-    delete filterFlags;
-    delete formatList;
-
     return ret;
 }
 
diff --git a/test_conformance/images/clFillImage/test_loops.cpp b/test_conformance/images/clFillImage/test_loops.cpp
index 3ab696ef12..759f48d2f3 100644
--- a/test_conformance/images/clFillImage/test_loops.cpp
+++ b/test_conformance/images/clFillImage/test_loops.cpp
@@ -69,35 +69,22 @@ int test_image_type( cl_device_id device, cl_context context, cl_command_queue q
     int ret = 0;
 
     // Grab the list of supported image formats
-    cl_image_format *formatList;
-    bool *filterFlags;
-    unsigned int numFormats;
-
-    if ( get_format_list( context, imageType, formatList, numFormats, flags ) )
-        return -1;
-
-    filterFlags = new bool[ numFormats ];
-    if ( filterFlags == NULL )
-    {
-        log_error( "ERROR: Out of memory allocating filter flags list!\n" );
-        return -1;
-    }
-    memset( filterFlags, 0, sizeof( bool ) * numFormats );
+    std::vector<cl_image_format> formatList;
+    if (get_format_list(context, imageType, formatList, flags)) return -1;
 
     for (auto test : imageTestTypes)
     {
         if (gTypesToTest & test.type)
         {
-            if (filter_formats(formatList, filterFlags, numFormats,
-                               test.channelTypes)
-                == 0)
+            std::vector<bool> filterFlags(formatList.size(), false);
+            if (filter_formats(formatList, filterFlags, test.channelTypes) == 0)
             {
                 log_info("No formats supported for %s type\n", test.name);
             }
             else
             {
                 // Run the format list
-                for (unsigned int i = 0; i < numFormats; i++)
+                for (unsigned int i = 0; i < formatList.size(); i++)
                 {
                     if (filterFlags[i])
                     {
@@ -125,9 +112,6 @@ int test_image_type( cl_device_id device, cl_context context, cl_command_queue q
         }
     }
 
-    delete[] filterFlags;
-    delete[] formatList;
-
     return ret;
 }
 
diff --git a/test_conformance/images/clGetInfo/test_loops.cpp b/test_conformance/images/clGetInfo/test_loops.cpp
index 0abb14bf99..17f02d8b46 100644
--- a/test_conformance/images/clGetInfo/test_loops.cpp
+++ b/test_conformance/images/clGetInfo/test_loops.cpp
@@ -29,28 +29,14 @@ int test_image_type( cl_device_id device, cl_context context, cl_mem_object_type
     int ret = 0;
 
     // Grab the list of supported image formats for integer reads
-    cl_image_format *formatList;
-    bool *filterFlags;
-    unsigned int numFormats;
+    std::vector<cl_image_format> formatList;
+    if (get_format_list(context, image_type, formatList, flags)) return -1;
 
-    if ( get_format_list( context, image_type, formatList, numFormats, flags ) )
-        return -1;
-
-    BufferOwningPtr<cl_image_format> formatListBuf(formatList);
-
-    filterFlags = new bool[ numFormats ];
-    BufferOwningPtr<bool> filterFlagsBuf(filterFlags);
-
-    if( filterFlags == NULL )
-    {
-        log_error( "ERROR: Out of memory allocating filter flags list!\n" );
-        return -1;
-    }
-    memset( filterFlags, 0, sizeof( bool ) * numFormats );
-    filter_formats( formatList, filterFlags, numFormats, 0 );
+    std::vector<bool> filterFlags(formatList.size(), false);
+    filter_formats(formatList, filterFlags, nullptr);
 
     // Run the format list
-    for( unsigned int i = 0; i < numFormats; i++ )
+    for (unsigned int i = 0; i < formatList.size(); i++)
     {
         int test_return = 0;
         if( filterFlags[i] )
diff --git a/test_conformance/images/clReadWriteImage/test_loops.cpp b/test_conformance/images/clReadWriteImage/test_loops.cpp
index 10fb7a7d44..782e4b37d5 100644
--- a/test_conformance/images/clReadWriteImage/test_loops.cpp
+++ b/test_conformance/images/clReadWriteImage/test_loops.cpp
@@ -40,50 +40,43 @@ int test_image_type( cl_device_id device, cl_context context, cl_command_queue q
 
     int ret = 0;
 
-    // Grab the list of supported image formats for integer reads
-    cl_image_format *formatList;
-    bool *filterFlags;
-    unsigned int numFormats;
-
-  if ( gTestMipmaps )
-  {
-    if ( 0 == is_extension_available( device, "cl_khr_mipmap_image" ))
+    if (gTestMipmaps)
     {
-      log_info( "-----------------------------------------------------\n" );
-      log_info( "This device does not support cl_khr_mipmap_image.\nSkipping mipmapped image test. \n" );
-      log_info( "-----------------------------------------------------\n\n" );
-      return 0;
+        if (0 == is_extension_available(device, "cl_khr_mipmap_image"))
+        {
+            log_info("-----------------------------------------------------\n");
+            log_info("This device does not support "
+                     "cl_khr_mipmap_image.\nSkipping mipmapped image test. \n");
+            log_info(
+                "-----------------------------------------------------\n\n");
+            return 0;
+        }
     }
-  }
 
-    if( get_format_list( context, imageType, formatList, numFormats, flags ) )
-        return -1;
+    // Grab the list of supported image formats for integer reads
+    std::vector<cl_image_format> formatList;
+    if (get_format_list(context, imageType, formatList, flags)) return -1;
 
-    filterFlags = new bool[ numFormats ];
-    if( filterFlags == NULL )
-    {
-        log_error( "ERROR: Out of memory allocating filter flags list!\n" );
-        return -1;
-    }
-    memset( filterFlags, 0, sizeof( bool ) * numFormats );
-    filter_formats( formatList, filterFlags, numFormats, 0 );
+    std::vector<bool> filterFlags(formatList.size(), false);
+    filter_formats(formatList, filterFlags, nullptr);
 
     // Run the format list
-    for( unsigned int i = 0; i < numFormats; i++ )
+    for (unsigned int i = 0; i < formatList.size(); i++)
     {
         int test_return = 0;
-        if( filterFlags[i] )
+        if (filterFlags[i])
         {
-            log_info( "NOT RUNNING: " );
-            print_header( &formatList[ i ], false );
+            log_info("NOT RUNNING: ");
+            print_header(&formatList[i], false);
             continue;
         }
 
-        print_header( &formatList[ i ], false );
+        print_header(&formatList[i], false);
 
         gTestCount++;
 
-        switch (imageType) {
+        switch (imageType)
+        {
             case CL_MEM_OBJECT_IMAGE1D:
                 test_return = test_read_image_set_1D(device, context, queue,
                                                      &formatList[i], flags);
@@ -106,19 +99,17 @@ int test_image_type( cl_device_id device, cl_context context, cl_command_queue q
                 break;
         }
 
-        if (test_return) {
+        if (test_return)
+        {
             gFailCount++;
-            log_error( "FAILED: " );
-            print_header( &formatList[ i ], true );
-            log_info( "\n" );
+            log_error("FAILED: ");
+            print_header(&formatList[i], true);
+            log_info("\n");
         }
 
         ret += test_return;
     }
 
-    delete[] filterFlags;
-    delete[] formatList;
-
     return ret;
 }
 
diff --git a/test_conformance/images/common.cpp b/test_conformance/images/common.cpp
index a14242efe3..7323f11c1c 100644
--- a/test_conformance/images/common.cpp
+++ b/test_conformance/images/common.cpp
@@ -58,13 +58,13 @@ std::array<ImageTestTypes, 3> imageTestTypes = { {
     { kTestFloat, kFloat, floatFormats, "float" },
 } };
 
-int filter_formats(cl_image_format *formatList, bool *filterFlags,
-                   unsigned int formatCount,
+int filter_formats(const std::vector<cl_image_format> &formatList,
+                   std::vector<bool> &filterFlags,
                    cl_channel_type *channelDataTypesToFilter,
                    bool testMipmaps /*=false*/)
 {
     int numSupported = 0;
-    for (unsigned int j = 0; j < formatCount; j++)
+    for (unsigned int j = 0; j < formatList.size(); j++)
     {
         // If this format has been previously filtered, remove the filter
         if (filterFlags[j]) filterFlags[j] = false;
@@ -129,18 +129,18 @@ int filter_formats(cl_image_format *formatList, bool *filterFlags,
 }
 
 int get_format_list(cl_context context, cl_mem_object_type imageType,
-                    cl_image_format *&outFormatList,
-                    unsigned int &outFormatCount, cl_mem_flags flags)
+                    std::vector<cl_image_format> &outFormatList,
+                    cl_mem_flags flags)
 {
+    cl_uint formatCount;
     int error = clGetSupportedImageFormats(context, flags, imageType, 0, NULL,
-                                           &outFormatCount);
+                                           &formatCount);
     test_error(error, "Unable to get count of supported image formats");
 
-    outFormatList =
-        (outFormatCount > 0) ? new cl_image_format[outFormatCount] : NULL;
+    outFormatList.resize(formatCount);
 
-    error = clGetSupportedImageFormats(context, flags, imageType,
-                                       outFormatCount, outFormatList, NULL);
+    error = clGetSupportedImageFormats(context, flags, imageType, formatCount,
+                                       outFormatList.data(), NULL);
     test_error(error, "Unable to get list of supported image formats");
     return 0;
 }
diff --git a/test_conformance/images/common.h b/test_conformance/images/common.h
index 7ae2f4fa52..27e8679be4 100644
--- a/test_conformance/images/common.h
+++ b/test_conformance/images/common.h
@@ -22,6 +22,7 @@
 #include "harness/conversions.h"
 
 #include <array>
+#include <vector>
 
 extern cl_channel_type gChannelTypeToUse;
 extern cl_channel_order gChannelOrderToUse;
@@ -40,13 +41,13 @@ struct ImageTestTypes
 
 extern std::array<ImageTestTypes, 3> imageTestTypes;
 
-int filter_formats(cl_image_format *formatList, bool *filterFlags,
-                   unsigned int formatCount,
+int filter_formats(const std::vector<cl_image_format> &formatList,
+                   std::vector<bool> &filterFlags,
                    cl_channel_type *channelDataTypesToFilter,
                    bool testMipmaps = false);
 int get_format_list(cl_context context, cl_mem_object_type imageType,
-                    cl_image_format *&outFormatList,
-                    unsigned int &outFormatCount, cl_mem_flags flags);
+                    std::vector<cl_image_format> &outFormatList,
+                    cl_mem_flags flags);
 size_t random_in_ranges(size_t minimum, size_t rangeA, size_t rangeB, MTdata d);
 
 #endif // IMAGES_COMMON_H
diff --git a/test_conformance/images/kernel_image_methods/test_loops.cpp b/test_conformance/images/kernel_image_methods/test_loops.cpp
index 4c7b93e8b9..1d892a9b0b 100644
--- a/test_conformance/images/kernel_image_methods/test_loops.cpp
+++ b/test_conformance/images/kernel_image_methods/test_loops.cpp
@@ -42,24 +42,14 @@ int test_image_type( cl_device_id device, cl_context context, cl_command_queue q
     int ret = 0;
 
     // Grab the list of supported image formats for integer reads
-    cl_image_format *formatList;
-    bool *filterFlags;
-    unsigned int numFormats;
+    std::vector<cl_image_format> formatList;
+    if (get_format_list(context, imageType, formatList, flags)) return -1;
 
-    if( get_format_list( context, imageType, formatList, numFormats, flags ) )
-        return -1;
-
-    filterFlags = new bool[ numFormats ];
-    if( filterFlags == NULL )
-    {
-        log_error( "ERROR: Out of memory allocating filter flags list!\n" );
-        return -1;
-    }
-    memset( filterFlags, 0, sizeof( bool ) * numFormats );
-    filter_formats( formatList, filterFlags, numFormats, 0 );
+    std::vector<bool> filterFlags(formatList.size(), false);
+    filter_formats(formatList, filterFlags, nullptr);
 
     // Run the format list
-    for( unsigned int i = 0; i < numFormats; i++ )
+    for (unsigned int i = 0; i < formatList.size(); i++)
     {
         int test_return = 0;
         if( filterFlags[i] )
@@ -106,9 +96,6 @@ int test_image_type( cl_device_id device, cl_context context, cl_command_queue q
         ret += test_return;
     }
 
-    delete filterFlags;
-    delete formatList;
-
     return ret;
 }
 
diff --git a/test_conformance/images/kernel_read_write/test_iterations.cpp b/test_conformance/images/kernel_read_write/test_iterations.cpp
index 08a4fd28cb..41cf5d3e80 100644
--- a/test_conformance/images/kernel_read_write/test_iterations.cpp
+++ b/test_conformance/images/kernel_read_write/test_iterations.cpp
@@ -1562,8 +1562,11 @@ int test_read_image_2D( cl_context context, cl_command_queue queue, cl_kernel ke
     return numTries != MAX_TRIES || numClamped != MAX_CLAMPED;
 }
 
-int test_read_image_set_2D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler,
-                        bool floatCoords, ExplicitType outputType )
+int test_read_image_set_2D(cl_device_id device, cl_context context,
+                           cl_command_queue queue,
+                           const cl_image_format *format,
+                           image_sampler_data *imageSampler, bool floatCoords,
+                           ExplicitType outputType)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/kernel_read_write/test_loops.cpp b/test_conformance/images/kernel_read_write/test_loops.cpp
index b1e0b7e432..795a9eda55 100644
--- a/test_conformance/images/kernel_read_write/test_loops.cpp
+++ b/test_conformance/images/kernel_read_write/test_loops.cpp
@@ -24,34 +24,34 @@ extern int gtestTypesToRun;
 
 extern int test_read_image_set_1D(cl_device_id device, cl_context context,
                                   cl_command_queue queue,
-                                  cl_image_format *format,
+                                  const cl_image_format *format,
                                   image_sampler_data *imageSampler,
                                   bool floatCoords, ExplicitType outputType);
 extern int test_read_image_set_2D(cl_device_id device, cl_context context,
                                   cl_command_queue queue,
-                                  cl_image_format *format,
+                                  const cl_image_format *format,
                                   image_sampler_data *imageSampler,
                                   bool floatCoords, ExplicitType outputType);
 extern int test_read_image_set_3D(cl_device_id device, cl_context context,
                                   cl_command_queue queue,
-                                  cl_image_format *format,
+                                  const cl_image_format *format,
                                   image_sampler_data *imageSampler,
                                   bool floatCoords, ExplicitType outputType);
 extern int test_read_image_set_1D_array(cl_device_id device, cl_context context,
                                         cl_command_queue queue,
-                                        cl_image_format *format,
+                                        const cl_image_format *format,
                                         image_sampler_data *imageSampler,
                                         bool floatCoords,
                                         ExplicitType outputType);
 extern int test_read_image_set_2D_array(cl_device_id device, cl_context context,
                                         cl_command_queue queue,
-                                        cl_image_format *format,
+                                        const cl_image_format *format,
                                         image_sampler_data *imageSampler,
                                         bool floatCoords,
                                         ExplicitType outputType);
 
 int test_read_image_type(cl_device_id device, cl_context context,
-                         cl_command_queue queue, cl_image_format *format,
+                         cl_command_queue queue, const cl_image_format *format,
                          bool floatCoords, image_sampler_data *imageSampler,
                          ExplicitType outputType, cl_mem_object_type imageType)
 {
@@ -164,8 +164,9 @@ int test_read_image_type(cl_device_id device, cl_context context,
 }
 
 int test_read_image_formats(cl_device_id device, cl_context context,
-                            cl_command_queue queue, cl_image_format *formatList,
-                            bool *filterFlags, unsigned int numFormats,
+                            cl_command_queue queue,
+                            const std::vector<cl_image_format> &formatList,
+                            const std::vector<bool> &filterFlags,
                             image_sampler_data *imageSampler,
                             ExplicitType outputType,
                             cl_mem_object_type imageType)
@@ -212,11 +213,11 @@ int test_read_image_formats(cl_device_id device, cl_context context,
                                              : "integer",
                      get_explicit_type_name(outputType));
 
-            for (unsigned int i = 0; i < numFormats; i++)
+            for (unsigned int i = 0; i < formatList.size(); i++)
             {
                 if (filterFlags[i]) continue;
 
-                cl_image_format &imageFormat = formatList[i];
+                const cl_image_format &imageFormat = formatList[i];
 
                 ret |=
                     test_read_image_type(device, context, queue, &imageFormat,
@@ -290,11 +291,6 @@ int test_image_set(cl_device_id device, cl_context context,
         }
     }
 
-    // Grab the list of supported image formats for integer reads
-    cl_image_format *formatList;
-    bool *filterFlags;
-    unsigned int numFormats;
-
     // This flag is only for querying the list of supported formats
     // The flag for creating image will be set explicitly in test functions
     cl_mem_flags flags;
@@ -326,19 +322,9 @@ int test_image_set(cl_device_id device, cl_context context,
         }
     }
 
-    if (get_format_list(context, imageType, formatList, numFormats, flags))
-        return -1;
-    BufferOwningPtr<cl_image_format> formatListBuf(formatList);
-
-
-    filterFlags = new bool[numFormats];
-    if (filterFlags == NULL)
-    {
-        log_error("ERROR: Out of memory allocating filter flags list!\n");
-        return -1;
-    }
-    BufferOwningPtr<bool> filterFlagsBuf(filterFlags);
-    memset(filterFlags, 0, sizeof(bool) * numFormats);
+    // Grab the list of supported image formats for integer reads
+    std::vector<cl_image_format> formatList;
+    if (get_format_list(context, imageType, formatList, flags)) return -1;
 
     // First time through, we'll go ahead and print the formats supported,
     // regardless of type
@@ -348,7 +334,7 @@ int test_image_set(cl_device_id device, cl_context context,
     {
         log_info("---- Supported %s %s formats for this device ---- \n",
                  convert_image_type_to_string(imageType), flagNames);
-        for (unsigned int f = 0; f < numFormats; f++)
+        for (unsigned int f = 0; f < formatList.size(); f++)
         {
             if (IsChannelOrderSupported(formatList[f].image_channel_order)
                 && IsChannelTypeSupported(
@@ -369,8 +355,9 @@ int test_image_set(cl_device_id device, cl_context context,
     {
         if (gTypesToTest & test.type)
         {
-            if (filter_formats(formatList, filterFlags, numFormats,
-                               test.channelTypes, gTestMipmaps)
+            std::vector<bool> filterFlags(formatList.size(), false);
+            if (filter_formats(formatList, filterFlags, test.channelTypes,
+                               gTestMipmaps)
                 == 0)
             {
                 log_info("No formats supported for %s type\n", test.name);
@@ -379,7 +366,7 @@ int test_image_set(cl_device_id device, cl_context context,
             {
                 imageSampler.filter_mode = CL_FILTER_NEAREST;
                 ret += formatTestFn(device, context, queue, formatList,
-                                    filterFlags, numFormats, &imageSampler,
+                                    filterFlags, &imageSampler,
                                     test.explicitType, imageType);
 
                 // Linear filtering is only supported with floats
@@ -387,7 +374,7 @@ int test_image_set(cl_device_id device, cl_context context,
                 {
                     imageSampler.filter_mode = CL_FILTER_LINEAR;
                     ret += formatTestFn(device, context, queue, formatList,
-                                        filterFlags, numFormats, &imageSampler,
+                                        filterFlags, &imageSampler,
                                         test.explicitType, imageType);
                 }
             }
diff --git a/test_conformance/images/kernel_read_write/test_read_1D.cpp b/test_conformance/images/kernel_read_write/test_read_1D.cpp
index ed387532ea..606d74fa7e 100644
--- a/test_conformance/images/kernel_read_write/test_read_1D.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_1D.cpp
@@ -985,8 +985,11 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, cl_kernel ke
     return numTries != MAX_TRIES || numClamped != MAX_CLAMPED;
 }
 
-int test_read_image_set_1D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler,
-                        bool floatCoords, ExplicitType outputType )
+int test_read_image_set_1D(cl_device_id device, cl_context context,
+                           cl_command_queue queue,
+                           const cl_image_format *format,
+                           image_sampler_data *imageSampler, bool floatCoords,
+                           ExplicitType outputType)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/kernel_read_write/test_read_1D_array.cpp b/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
index 1ffd598015..f2b723ad82 100644
--- a/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
@@ -1094,8 +1094,11 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, cl_ker
     return numTries != MAX_TRIES || numClamped != MAX_CLAMPED;
 }
 
-int test_read_image_set_1D_array( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler,
-                                 bool floatCoords, ExplicitType outputType )
+int test_read_image_set_1D_array(cl_device_id device, cl_context context,
+                                 cl_command_queue queue,
+                                 const cl_image_format *format,
+                                 image_sampler_data *imageSampler,
+                                 bool floatCoords, ExplicitType outputType)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/kernel_read_write/test_read_2D_array.cpp b/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
index 558b126693..55c03d75a8 100644
--- a/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
@@ -1301,8 +1301,11 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker
     return numTries != MAX_TRIES || numClamped != MAX_CLAMPED;
 }
 
-int test_read_image_set_2D_array( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler,
-                           bool floatCoords, ExplicitType outputType )
+int test_read_image_set_2D_array(cl_device_id device, cl_context context,
+                                 cl_command_queue queue,
+                                 const cl_image_format *format,
+                                 image_sampler_data *imageSampler,
+                                 bool floatCoords, ExplicitType outputType)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/kernel_read_write/test_read_3D.cpp b/test_conformance/images/kernel_read_write/test_read_3D.cpp
index 0fd777f9fe..7b598132ed 100644
--- a/test_conformance/images/kernel_read_write/test_read_3D.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_3D.cpp
@@ -1158,8 +1158,11 @@ int test_read_image_3D( cl_context context, cl_command_queue queue, cl_kernel ke
     return numTries != MAX_TRIES || numClamped != MAX_CLAMPED;
 }
 
-int test_read_image_set_3D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler,
-                           bool floatCoords, ExplicitType outputType )
+int test_read_image_set_3D(cl_device_id device, cl_context context,
+                           cl_command_queue queue,
+                           const cl_image_format *format,
+                           image_sampler_data *imageSampler, bool floatCoords,
+                           ExplicitType outputType)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/kernel_read_write/test_write_1D.cpp b/test_conformance/images/kernel_read_write/test_write_1D.cpp
index bae88b23cc..41983edf75 100644
--- a/test_conformance/images/kernel_read_write/test_write_1D.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_1D.cpp
@@ -521,7 +521,10 @@ int test_write_image_1D( cl_device_id device, cl_context context, cl_command_que
     return totalErrors;
 }
 
-int test_write_image_1D_set( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, ExplicitType inputType, MTdata d )
+int test_write_image_1D_set(cl_device_id device, cl_context context,
+                            cl_command_queue queue,
+                            const cl_image_format *format,
+                            ExplicitType inputType, MTdata d)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/kernel_read_write/test_write_1D_array.cpp b/test_conformance/images/kernel_read_write/test_write_1D_array.cpp
index 6242797aef..c771704cad 100644
--- a/test_conformance/images/kernel_read_write/test_write_1D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_1D_array.cpp
@@ -542,7 +542,10 @@ int test_write_image_1D_array( cl_device_id device, cl_context context, cl_comma
 }
 
 
-int test_write_image_1D_array_set( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, ExplicitType inputType, MTdata d )
+int test_write_image_1D_array_set(cl_device_id device, cl_context context,
+                                  cl_command_queue queue,
+                                  const cl_image_format *format,
+                                  ExplicitType inputType, MTdata d)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/kernel_read_write/test_write_2D_array.cpp b/test_conformance/images/kernel_read_write/test_write_2D_array.cpp
index b0dc1cc3e4..08a7a80334 100644
--- a/test_conformance/images/kernel_read_write/test_write_2D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_2D_array.cpp
@@ -568,7 +568,10 @@ int test_write_image_2D_array( cl_device_id device, cl_context context, cl_comma
 }
 
 
-int test_write_image_2D_array_set( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, ExplicitType inputType, MTdata d )
+int test_write_image_2D_array_set(cl_device_id device, cl_context context,
+                                  cl_command_queue queue,
+                                  const cl_image_format *format,
+                                  ExplicitType inputType, MTdata d)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/kernel_read_write/test_write_3D.cpp b/test_conformance/images/kernel_read_write/test_write_3D.cpp
index 7df29ca489..5cc96bb4b0 100644
--- a/test_conformance/images/kernel_read_write/test_write_3D.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_3D.cpp
@@ -576,7 +576,10 @@ int test_write_image_3D( cl_device_id device, cl_context context, cl_command_que
 }
 
 
-int test_write_image_3D_set( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, ExplicitType inputType, MTdata d )
+int test_write_image_3D_set(cl_device_id device, cl_context context,
+                            cl_command_queue queue,
+                            const cl_image_format *format,
+                            ExplicitType inputType, MTdata d)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/kernel_read_write/test_write_image.cpp b/test_conformance/images/kernel_read_write/test_write_image.cpp
index 2beaf40298..e40e80d61f 100644
--- a/test_conformance/images/kernel_read_write/test_write_image.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_image.cpp
@@ -23,10 +23,24 @@ extern bool gTestImage2DFromBuffer;
 extern cl_mem_flags gMemFlagsToUse;
 extern int gtestTypesToRun;
 
-extern int test_write_image_1D_set( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, ExplicitType inputType, MTdata d );
-extern int test_write_image_3D_set( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, ExplicitType inputType, MTdata d );
-extern int test_write_image_1D_array_set( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, ExplicitType inputType, MTdata d );
-extern int test_write_image_2D_array_set( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, ExplicitType inputType, MTdata d );
+extern int test_write_image_1D_set(cl_device_id device, cl_context context,
+                                   cl_command_queue queue,
+                                   const cl_image_format *format,
+                                   ExplicitType inputType, MTdata d);
+extern int test_write_image_3D_set(cl_device_id device, cl_context context,
+                                   cl_command_queue queue,
+                                   const cl_image_format *format,
+                                   ExplicitType inputType, MTdata d);
+extern int test_write_image_1D_array_set(cl_device_id device,
+                                         cl_context context,
+                                         cl_command_queue queue,
+                                         const cl_image_format *format,
+                                         ExplicitType inputType, MTdata d);
+extern int test_write_image_2D_array_set(cl_device_id device,
+                                         cl_context context,
+                                         cl_command_queue queue,
+                                         const cl_image_format *format,
+                                         ExplicitType inputType, MTdata d);
 
 extern bool validate_float_write_results( float *expected, float *actual, image_descriptor *imageInfo );
 extern bool validate_half_write_results( cl_half *expected, cl_half *actual, image_descriptor *imageInfo );
@@ -595,7 +609,9 @@ int test_write_image( cl_device_id device, cl_context context, cl_command_queue
 }
 
 
-int test_write_image_set( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, ExplicitType inputType, MTdata d )
+int test_write_image_set(cl_device_id device, cl_context context,
+                         cl_command_queue queue, const cl_image_format *format,
+                         ExplicitType inputType, MTdata d)
 {
     char programSrc[10240];
     const char *ptr;
@@ -797,8 +813,13 @@ int test_write_image_set( cl_device_id device, cl_context context, cl_command_qu
     return 0;
 }
 
-int test_write_image_formats( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *formatList, bool *filterFlags, unsigned int numFormats,
-                             image_sampler_data *imageSampler, ExplicitType inputType, cl_mem_object_type imageType )
+int test_write_image_formats(cl_device_id device, cl_context context,
+                             cl_command_queue queue,
+                             const std::vector<cl_image_format> &formatList,
+                             const std::vector<bool> &filterFlags,
+                             image_sampler_data *imageSampler,
+                             ExplicitType inputType,
+                             cl_mem_object_type imageType)
 {
     if( imageSampler->filter_mode == CL_FILTER_LINEAR )
         // No need to run for linear filters
@@ -811,9 +832,9 @@ int test_write_image_formats( cl_device_id device, cl_context context, cl_comman
 
     RandomSeed seed( gRandomSeed );
 
-    for( unsigned int i = 0; i < numFormats; i++ )
+    for (unsigned int i = 0; i < formatList.size(); i++)
     {
-        cl_image_format &imageFormat = formatList[ i ];
+        const cl_image_format &imageFormat = formatList[i];
 
         if( filterFlags[ i ] )
             continue;
diff --git a/test_conformance/images/samplerlessReads/test_iterations.cpp b/test_conformance/images/samplerlessReads/test_iterations.cpp
index f6bb42afd4..55eaaf48c8 100644
--- a/test_conformance/images/samplerlessReads/test_iterations.cpp
+++ b/test_conformance/images/samplerlessReads/test_iterations.cpp
@@ -176,8 +176,11 @@ int test_read_image_2D( cl_context context, cl_command_queue queue, cl_kernel ke
     return 0;
 }
 
-int test_read_image_set_2D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler,
-                            ExplicitType outputType )
+int test_read_image_set_2D(cl_device_id device, cl_context context,
+                           cl_command_queue queue,
+                           const cl_image_format *format,
+                           image_sampler_data *imageSampler,
+                           ExplicitType outputType)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/samplerlessReads/test_loops.cpp b/test_conformance/images/samplerlessReads/test_loops.cpp
index 6f5d009b80..db49a8f62f 100644
--- a/test_conformance/images/samplerlessReads/test_loops.cpp
+++ b/test_conformance/images/samplerlessReads/test_loops.cpp
@@ -19,15 +19,42 @@
 extern int gTypesToTest;
 extern bool gTestReadWrite;
 
-extern int test_read_image_set_1D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler, ExplicitType outputType );
-extern int test_read_image_set_1D_buffer( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler, ExplicitType outputType );
-extern int test_read_image_set_2D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler, ExplicitType outputType );
-extern int test_read_image_set_3D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler, ExplicitType outputType );
-extern int test_read_image_set_1D_array( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler, ExplicitType outputType );
-extern int test_read_image_set_2D_array( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler, ExplicitType outputType );
-
-int test_read_image_type( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format,
-                          image_sampler_data *imageSampler, ExplicitType outputType, cl_mem_object_type imageType )
+extern int test_read_image_set_1D(cl_device_id device, cl_context context,
+                                  cl_command_queue queue,
+                                  const cl_image_format *format,
+                                  image_sampler_data *imageSampler,
+                                  ExplicitType outputType);
+extern int test_read_image_set_1D_buffer(cl_device_id device,
+                                         cl_context context,
+                                         cl_command_queue queue,
+                                         const cl_image_format *format,
+                                         image_sampler_data *imageSampler,
+                                         ExplicitType outputType);
+extern int test_read_image_set_2D(cl_device_id device, cl_context context,
+                                  cl_command_queue queue,
+                                  const cl_image_format *format,
+                                  image_sampler_data *imageSampler,
+                                  ExplicitType outputType);
+extern int test_read_image_set_3D(cl_device_id device, cl_context context,
+                                  cl_command_queue queue,
+                                  const cl_image_format *format,
+                                  image_sampler_data *imageSampler,
+                                  ExplicitType outputType);
+extern int test_read_image_set_1D_array(cl_device_id device, cl_context context,
+                                        cl_command_queue queue,
+                                        const cl_image_format *format,
+                                        image_sampler_data *imageSampler,
+                                        ExplicitType outputType);
+extern int test_read_image_set_2D_array(cl_device_id device, cl_context context,
+                                        cl_command_queue queue,
+                                        const cl_image_format *format,
+                                        image_sampler_data *imageSampler,
+                                        ExplicitType outputType);
+
+int test_read_image_type(cl_device_id device, cl_context context,
+                         cl_command_queue queue, const cl_image_format *format,
+                         image_sampler_data *imageSampler,
+                         ExplicitType outputType, cl_mem_object_type imageType)
 {
     int ret = 0;
     imageSampler->addressing_mode = CL_ADDRESS_NONE;
@@ -68,20 +95,25 @@ int test_read_image_type( cl_device_id device, cl_context context, cl_command_qu
     return ret;
 }
 
-int test_read_image_formats( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *formatList, bool *filterFlags, unsigned int numFormats,
-                             image_sampler_data *imageSampler, ExplicitType outputType, cl_mem_object_type imageType )
+int test_read_image_formats(cl_device_id device, cl_context context,
+                            cl_command_queue queue,
+                            const std::vector<cl_image_format> &formatList,
+                            const std::vector<bool> &filterFlags,
+                            image_sampler_data *imageSampler,
+                            ExplicitType outputType,
+                            cl_mem_object_type imageType)
 {
     int ret = 0;
     imageSampler->normalized_coords = false;
     log_info( "read_image (%s coords, %s results) *****************************\n",
               "integer", get_explicit_type_name( outputType ) );
 
-    for ( unsigned int i = 0; i < numFormats; i++ )
+    for (unsigned int i = 0; i < formatList.size(); i++)
     {
         if ( filterFlags[i] )
             continue;
 
-        cl_image_format &imageFormat = formatList[ i ];
+        const cl_image_format &imageFormat = formatList[i];
 
         ret |= test_read_image_type( device, context, queue, &imageFormat, imageSampler, outputType, imageType );
     }
@@ -95,78 +127,50 @@ int test_image_set( cl_device_id device, cl_context context, cl_command_queue qu
     static int printedFormatList = -1;
 
     // Grab the list of supported image formats
-    cl_image_format *formatList;
-    unsigned int numFormats;
+    std::vector<cl_image_format> formatList;
 
     if (gTestReadWrite && checkForReadWriteImageSupport(device))
     {
         return TEST_SKIPPED_ITSELF;
     }
 
-    cl_image_format *readOnlyFormats;
-    unsigned int numReadOnlyFormats;
-
-    if (get_format_list(context, imageType, readOnlyFormats, numReadOnlyFormats,
-                        CL_MEM_READ_ONLY))
+    std::vector<cl_image_format> readOnlyFormats;
+    if (get_format_list(context, imageType, readOnlyFormats, CL_MEM_READ_ONLY))
         return -1;
 
     if (gTestReadWrite)
     {
-        cl_image_format *readWriteFormats;
-        unsigned int numReadWriteFormats;
-
+        std::vector<cl_image_format> readWriteFormats;
         if (get_format_list(context, imageType, readWriteFormats,
-                            numReadWriteFormats, CL_MEM_KERNEL_READ_AND_WRITE))
+                            CL_MEM_KERNEL_READ_AND_WRITE))
             return -1;
 
-        numFormats = numReadOnlyFormats;
-        formatList = new cl_image_format[numFormats];
-        unsigned int k = 0;
-
         // Keep only intersecting formats with read only and read write flags
-        for (unsigned int i = 0; i < numReadOnlyFormats; i++)
+        for (unsigned int i = 0; i < readOnlyFormats.size(); i++)
         {
-            for (unsigned int j = 0; j < numReadWriteFormats; j++)
+            for (unsigned int j = 0; j < readWriteFormats.size(); j++)
             {
                 if (readOnlyFormats[i].image_channel_data_type
                         == readWriteFormats[j].image_channel_data_type
                     && readOnlyFormats[i].image_channel_order
                         == readWriteFormats[j].image_channel_order)
                 {
-                    formatList[k].image_channel_data_type =
-                        readOnlyFormats[i].image_channel_data_type;
-                    formatList[k].image_channel_order =
-                        readOnlyFormats[i].image_channel_order;
-                    k++;
+                    formatList.push_back(readOnlyFormats[i]);
                     break;
                 }
             }
         }
-
-        numFormats = k;
-
-        delete[] readOnlyFormats;
-        delete[] readWriteFormats;
     }
     else
     {
-        numFormats = numReadOnlyFormats;
         formatList = readOnlyFormats;
     }
 
-    bool *filterFlags = new bool[numFormats];
-    if ( filterFlags == NULL )
-    {
-        log_error( "ERROR: Out of memory allocating filter flags list!\n" );
-        return -1;
-    }
-    memset( filterFlags, 0, sizeof( bool ) * numFormats );
-
     // First time through, we'll go ahead and print the formats supported, regardless of type
     if ( printedFormatList != (int)imageType )
     {
         log_info( "---- Supported %s read formats for this device ---- \n", convert_image_type_to_string(imageType) );
-        for ( unsigned int f = 0; f < numFormats; f++ )
+        for (unsigned int f = 0; f < formatList.size(); f++)
             log_info( "  %-7s %-24s %d\n", GetChannelOrderName( formatList[ f ].image_channel_order ),
                       GetChannelTypeName( formatList[ f ].image_channel_data_type ),
                       (int)get_format_channel_count( &formatList[ f ] ) );
@@ -180,9 +184,8 @@ int test_image_set( cl_device_id device, cl_context context, cl_command_queue qu
     {
         if (gTypesToTest & test.type)
         {
-            if (filter_formats(formatList, filterFlags, numFormats,
-                               test.channelTypes)
-                == 0)
+            std::vector<bool> filterFlags(formatList.size(), false);
+            if (filter_formats(formatList, filterFlags, test.channelTypes) == 0)
             {
                 log_info("No formats supported for %s type\n", test.name);
             }
@@ -190,14 +193,11 @@ int test_image_set( cl_device_id device, cl_context context, cl_command_queue qu
             {
                 imageSampler.filter_mode = CL_FILTER_NEAREST;
                 ret += test_read_image_formats(
-                    device, context, queue, formatList, filterFlags, numFormats,
+                    device, context, queue, formatList, filterFlags,
                     &imageSampler, test.explicitType, imageType);
             }
         }
     }
 
-    delete[] filterFlags;
-    delete[] formatList;
-
     return ret;
 }
diff --git a/test_conformance/images/samplerlessReads/test_read_1D.cpp b/test_conformance/images/samplerlessReads/test_read_1D.cpp
index a55d2be5d1..aa261b7ead 100644
--- a/test_conformance/images/samplerlessReads/test_read_1D.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_1D.cpp
@@ -177,8 +177,11 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, cl_kernel ke
     return 0;
 }
 
-int test_read_image_set_1D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler,
-                            ExplicitType outputType )
+int test_read_image_set_1D(cl_device_id device, cl_context context,
+                           cl_command_queue queue,
+                           const cl_image_format *format,
+                           image_sampler_data *imageSampler,
+                           ExplicitType outputType)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/samplerlessReads/test_read_1D_array.cpp b/test_conformance/images/samplerlessReads/test_read_1D_array.cpp
index b63fed4f0c..fb0c26326b 100644
--- a/test_conformance/images/samplerlessReads/test_read_1D_array.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_1D_array.cpp
@@ -175,8 +175,11 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, cl_ker
     return 0;
 }
 
-int test_read_image_set_1D_array( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler,
-                            ExplicitType outputType )
+int test_read_image_set_1D_array(cl_device_id device, cl_context context,
+                                 cl_command_queue queue,
+                                 const cl_image_format *format,
+                                 image_sampler_data *imageSampler,
+                                 ExplicitType outputType)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp b/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp
index ee48ec8407..7a3084d3ab 100644
--- a/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp
@@ -160,8 +160,11 @@ int test_read_image_1D_buffer( cl_context context, cl_command_queue queue, cl_ke
     return 0;
 }
 
-int test_read_image_set_1D_buffer( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format, image_sampler_data *imageSampler,
-                            ExplicitType outputType )
+int test_read_image_set_1D_buffer(cl_device_id device, cl_context context,
+                                  cl_command_queue queue,
+                                  const cl_image_format *format,
+                                  image_sampler_data *imageSampler,
+                                  ExplicitType outputType)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/samplerlessReads/test_read_2D_array.cpp b/test_conformance/images/samplerlessReads/test_read_2D_array.cpp
index 95a973c401..99f24266d4 100644
--- a/test_conformance/images/samplerlessReads/test_read_2D_array.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_2D_array.cpp
@@ -161,8 +161,11 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker
     return 0;
 }
 
-int test_read_image_set_2D_array( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format,
-                                  image_sampler_data *imageSampler, ExplicitType outputType )
+int test_read_image_set_2D_array(cl_device_id device, cl_context context,
+                                 cl_command_queue queue,
+                                 const cl_image_format *format,
+                                 image_sampler_data *imageSampler,
+                                 ExplicitType outputType)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/samplerlessReads/test_read_3D.cpp b/test_conformance/images/samplerlessReads/test_read_3D.cpp
index 39ca71aedc..cf4114074f 100644
--- a/test_conformance/images/samplerlessReads/test_read_3D.cpp
+++ b/test_conformance/images/samplerlessReads/test_read_3D.cpp
@@ -164,8 +164,11 @@ int test_read_image_3D( cl_context context, cl_command_queue queue, cl_kernel ke
     return 0;
 }
 
-int test_read_image_set_3D( cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format,
-                            image_sampler_data *imageSampler, ExplicitType outputType )
+int test_read_image_set_3D(cl_device_id device, cl_context context,
+                           cl_command_queue queue,
+                           const cl_image_format *format,
+                           image_sampler_data *imageSampler,
+                           ExplicitType outputType)
 {
     char programSrc[10240];
     const char *ptr;
diff --git a/test_conformance/images/testBase.h b/test_conformance/images/testBase.h
index 7c45fdd109..ad48f10d5c 100644
--- a/test_conformance/images/testBase.h
+++ b/test_conformance/images/testBase.h
@@ -64,19 +64,22 @@ enum TestTypes
     kAllTests = ( kReadTests | kWriteTests | kReadWriteTests )
 };
 
-typedef int (*test_format_set_fn)( cl_device_id device, cl_context context, cl_command_queue queue,
-  cl_image_format *formatList, bool *filterFlags, unsigned int numFormats,
-  image_sampler_data *imageSampler, ExplicitType outputType,
-  cl_mem_object_type imageType );
+typedef int (*test_format_set_fn)(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    const std::vector<cl_image_format> &formatList,
+    const std::vector<bool> &filterFlags, image_sampler_data *imageSampler,
+    ExplicitType outputType, cl_mem_object_type imageType);
 
-extern int test_read_image_formats( cl_device_id device, cl_context context, cl_command_queue queue,
-  cl_image_format *formatList, bool *filterFlags, unsigned int numFormats,
-  image_sampler_data *imageSampler, ExplicitType outputType,
-  cl_mem_object_type imageType );
-extern int test_write_image_formats( cl_device_id device, cl_context context, cl_command_queue queue,
-  cl_image_format *formatList, bool *filterFlags, unsigned int numFormats,
-  image_sampler_data *imageSampler, ExplicitType outputType,
-  cl_mem_object_type imageType );
+extern int test_read_image_formats(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    const std::vector<cl_image_format> &formatList,
+    const std::vector<bool> &filterFlags, image_sampler_data *imageSampler,
+    ExplicitType outputType, cl_mem_object_type imageType);
+extern int test_write_image_formats(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    const std::vector<cl_image_format> &formatList,
+    const std::vector<bool> &filterFlags, image_sampler_data *imageSampler,
+    ExplicitType outputType, cl_mem_object_type imageType);
 
 #endif // _testBase_h
 

From 0d74b3f926e547b2ae6d13de55b16aa5a6711265 Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Fri, 15 Jan 2021 17:31:03 +0100
Subject: [PATCH 019/158] Test api fix - sampler properties is 0 in
 compatibility mode (#1106)

---
 test_conformance/api/test_queries.cpp | 42 ++++++++++++++++++---------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/test_conformance/api/test_queries.cpp b/test_conformance/api/test_queries.cpp
index 91678a20df..469a19349a 100644
--- a/test_conformance/api/test_queries.cpp
+++ b/test_conformance/api/test_queries.cpp
@@ -231,23 +231,39 @@ int test_sampler_params(cl_device_id deviceID, cl_context context,
             error,
             "clGetSamplerInfo failed asking for CL_SAMPLER_PROPERTIES size.");
 
-        if (set_size != test_properties.size() * sizeof(cl_sampler_properties))
+        if (is_compatibility)
         {
-            log_error("ERROR: CL_SAMPLER_PROPERTIES size is %d, expected %d.\n",
-                      set_size,
-                      test_properties.size() * sizeof(cl_sampler_properties));
-            return TEST_FAIL;
+            if (set_size != 0)
+            {
+                log_error(
+                    "ERROR: CL_SAMPLER_PROPERTIES size is %d, expected 0\n",
+                    set_size);
+                return TEST_FAIL;
+            }
         }
+        else
+        {
+            if (set_size
+                != test_properties.size() * sizeof(cl_sampler_properties))
+            {
+                log_error(
+                    "ERROR: CL_SAMPLER_PROPERTIES size is %d, expected %d.\n",
+                    set_size,
+                    test_properties.size() * sizeof(cl_sampler_properties));
+                return TEST_FAIL;
+            }
 
-        cl_uint number_of_props = set_size / sizeof(cl_sampler_properties);
-        check_properties.resize(number_of_props);
-        error = clGetSamplerInfo(sampler, CL_SAMPLER_PROPERTIES, set_size,
-                                 check_properties.data(), 0);
-        test_error(error,
-                   "clGetSamplerInfo failed asking for CL_SAMPLER_PROPERTIES.");
+            cl_uint number_of_props = set_size / sizeof(cl_sampler_properties);
+            check_properties.resize(number_of_props);
+            error = clGetSamplerInfo(sampler, CL_SAMPLER_PROPERTIES, set_size,
+                                     check_properties.data(), 0);
+            test_error(
+                error,
+                "clGetSamplerInfo failed asking for CL_SAMPLER_PROPERTIES.");
 
-        error = compareProperties(check_properties, test_properties);
-        test_error(error, "checkProperties mismatch.");
+            error = compareProperties(check_properties, test_properties);
+            test_error(error, "checkProperties mismatch.");
+        }
     }
     return 0;
 }

From af6d55d68c677a9fa1b009e833f503b8bf1f5b95 Mon Sep 17 00:00:00 2001
From: James Price <jrprice@google.com>
Date: Tue, 19 Jan 2021 11:35:30 -0500
Subject: [PATCH 020/158] Use delete[] to deallocate after new[] (#1107)

* Use delete[] to deallocate after new[]

* Fixup formatting with git-clang-format
---
 .../mem_host_flags/C_host_memory_block.h      | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/test_conformance/mem_host_flags/C_host_memory_block.h b/test_conformance/mem_host_flags/C_host_memory_block.h
index 91b47abfce..1d3b47570e 100644
--- a/test_conformance/mem_host_flags/C_host_memory_block.h
+++ b/test_conformance/mem_host_flags/C_host_memory_block.h
@@ -69,31 +69,28 @@ C_host_memory_block<T>::C_host_memory_block()
 template < class T>
 C_host_memory_block<T>::~C_host_memory_block()
 {
-  if (pData!=NULL) delete pData;
-  num_elements = 0;
+    if (pData != NULL) delete[] pData;
+    num_elements = 0;
 }
 
 template < class T >
 void C_host_memory_block<T>::Init(int num_elem, T & value)
 {
-  if (pData!=NULL) delete pData;
-  pData= new T [num_elem];
-  for (int i=0; i<num_elem; i++)
-    pData[i] = value;
+    if (pData != NULL) delete[] pData;
+    pData = new T[num_elem];
+    for (int i = 0; i < num_elem; i++) pData[i] = value;
 
-  num_elements= num_elem;
+    num_elements = num_elem;
 }
 
 template < class T >
 void C_host_memory_block<T>::Init(int num_elem)
 {
-  if (pData!=NULL) delete pData;
-  pData = new T [num_elem];
-  for (int i=0; i<num_elem; i++)
-    pData[i]= (T) i;
-
-  num_elements = num_elem;
+    if (pData != NULL) delete[] pData;
+    pData = new T[num_elem];
+    for (int i = 0; i < num_elem; i++) pData[i] = (T)i;
 
+    num_elements = num_elem;
 }
 template < class T >
 void  C_host_memory_block<T>::Set_to_zero()

From be93630330194886fed8057cae01cb725f29261d Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Wed, 20 Jan 2021 15:01:59 +0000
Subject: [PATCH 021/158] Remove dead code in math_brute_force (#1117)

* Remove dead code

Signed-off-by: Marco Antognini <marco.antognini@arm.com>

* Remove tautological statements

PARALLEL_REFERENCE is unconditionally defined. Remove preprocessor
conditions that always hold.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>

* Remove unnecessary declarations

Also removed unused macro.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>

* Format code

An unnecessary scope was removed. This formats the code using
clang-format.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/binary_two_results_i.cpp | 300 +++++-----
 test_conformance/math_brute_force/i_unary.cpp |  10 -
 test_conformance/math_brute_force/mad.cpp     | 514 +-----------------
 .../math_brute_force/reference_math.cpp       | 287 +---------
 test_conformance/math_brute_force/ternary.cpp |  10 -
 .../math_brute_force/unary_two_results.cpp    |  10 -
 .../math_brute_force/unary_two_results_i.cpp  |  10 -
 test_conformance/math_brute_force/unary_u.cpp |  10 -
 8 files changed, 132 insertions(+), 1019 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
index 5065b280df..1130e93cb3 100644
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i.cpp
@@ -19,8 +19,6 @@
 #include <string.h>
 #include "FunctionList.h"
 
-#define PARALLEL_REFERENCE
-
 int TestFunc_FloatI_Float_Float(const Func *f, MTdata, bool relaxedMode);
 int TestFunc_DoubleI_Double_Double(const Func *f, MTdata, bool relaxedMode);
 
@@ -248,7 +246,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->programs + i, info->relaxedMode);
 }
 
-#if defined PARALLEL_REFERENCE
 typedef struct ComputeReferenceInfoF_
 {
     const float *x;
@@ -315,8 +312,6 @@ static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
     return CL_SUCCESS;
 }
 
-#endif
-
 int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     uint64_t i;
@@ -333,9 +328,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
 
-#if defined PARALLEL_REFERENCE
     cl_uint threadCount = GetThreadCount();
-#endif
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     if (gIsEmbedded)
@@ -354,11 +347,6 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                                    &build_info)))
             return error;
     }
-    /*
-        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
-       programs + i) ) ) return error;
-    */
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
@@ -457,7 +445,6 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         float *s = (float *)gIn;
         float *s2 = (float *)gIn2;
 
-#if defined PARALLEL_REFERENCE
         if (threadCount > 1)
         {
             ComputeReferenceInfoF cri;
@@ -472,14 +459,11 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         }
         else
         {
-#endif
             float *r = (float *)gOut_Ref;
             int *r2 = (int *)gOut_Ref2;
             for (j = 0; j < bufferSize / sizeof(float); j++)
                 r[j] = (float)f->func.f_ffpI(s[j], s2[j], r2 + j);
-#if defined PARALLEL_REFERENCE
         }
-#endif
 
         // Read the data back
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
@@ -848,9 +832,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
-#if defined PARALLEL_REFERENCE
     cl_uint threadCount = GetThreadCount();
-#endif
 
     Force64BitFPUPrecision();
 
@@ -867,11 +849,6 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             return error;
         }
     }
-    /*
-        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
-       i, programs + i) ) ) return error;
-    */
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
@@ -970,7 +947,6 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         double *s = (double *)gIn;
         double *s2 = (double *)gIn2;
 
-#if defined PARALLEL_REFERENCE
         if (threadCount > 1)
         {
             ComputeReferenceInfoD cri;
@@ -985,14 +961,11 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         }
         else
         {
-#endif
             double *r = (double *)gOut_Ref;
             int *r2 = (int *)gOut_Ref2;
             for (j = 0; j < bufferSize / sizeof(double); j++)
                 r[j] = (double)f->dfunc.f_ffpI(s[j], s2[j], r2 + j);
-#if defined PARALLEL_REFERENCE
         }
-#endif
 
         // Read the data back
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
@@ -1034,180 +1007,164 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
                     && t2[j] == q2[j])
                     continue;
 
-                // if( t[j] != q[j] || t2[j] != q2[j] )
+                double test = ((double *)q)[j];
+                int correct2 = INT_MIN;
+                long double correct = f->dfunc.f_ffpI(s[j], s2[j], &correct2);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int64_t iErr;
+
+                // in case of remquo, we only care about the sign and last
+                // seven bits of integer as per the spec.
+                if (testingRemquo)
+                    iErr = (long long)(q2[j] & 0x0000007f)
+                        - (long long)(correct2 & 0x0000007f);
+                else
+                    iErr = (long long)q2[j] - (long long)correct2;
+
+                // For remquo, if y = 0, x is infinite, or either is NaN
+                // then the standard either neglects to say what is returned
+                // in iptr or leaves it undefined or implementation defined.
+                int iptrUndefined = fabs(((double *)gIn)[j]) == INFINITY
+                    || ((double *)gIn2)[j] == 0.0 || isnan(((double *)gIn2)[j])
+                    || isnan(((double *)gIn)[j]);
+                if (iptrUndefined) iErr = 0;
+
+                int fail = !(fabsf(err) <= f->double_ulps && iErr == 0);
+                if (ftz && fail)
                 {
-                    double test = ((double *)q)[j];
-                    int correct2 = INT_MIN;
-                    long double correct =
-                        f->dfunc.f_ffpI(s[j], s2[j], &correct2);
-                    float err = Bruteforce_Ulp_Error_Double(test, correct);
-                    int64_t iErr;
-
-                    // in case of remquo, we only care about the sign and last
-                    // seven bits of integer as per the spec.
-                    if (testingRemquo)
-                        iErr = (long long)(q2[j] & 0x0000007f)
-                            - (long long)(correct2 & 0x0000007f);
-                    else
-                        iErr = (long long)q2[j] - (long long)correct2;
-
-                    // For remquo, if y = 0, x is infinite, or either is NaN
-                    // then the standard either neglects to say what is returned
-                    // in iptr or leaves it undefined or implementation defined.
-                    int iptrUndefined = fabs(((double *)gIn)[j]) == INFINITY
-                        || ((double *)gIn2)[j] == 0.0
-                        || isnan(((double *)gIn2)[j])
-                        || isnan(((double *)gIn)[j]);
-                    if (iptrUndefined) iErr = 0;
+                    // retry per section 6.5.3.2
+                    if (IsDoubleResultSubnormal(correct, f->double_ulps))
+                    {
+                        fail = fail && !(test == 0.0f && iErr == 0);
+                        if (!fail) err = 0.0f;
+                    }
 
-                    int fail = !(fabsf(err) <= f->double_ulps && iErr == 0);
-                    if (ftz && fail)
+                    // retry per section 6.5.3.3
+                    if (IsDoubleSubnormal(s[j]))
                     {
-                        // retry per section 6.5.3.2
-                        if (IsDoubleResultSubnormal(correct, f->double_ulps))
+                        int correct3i, correct4i;
+                        long double correct3 =
+                            f->dfunc.f_ffpI(0.0, s2[j], &correct3i);
+                        long double correct4 =
+                            f->dfunc.f_ffpI(-0.0, s2[j], &correct4i);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= f->double_ulps
+                                      && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4
+                        if (IsDoubleResultSubnormal(correct2, f->double_ulps)
+                            || IsDoubleResultSubnormal(correct3,
+                                                       f->double_ulps))
                         {
-                            fail = fail && !(test == 0.0f && iErr == 0);
+                            fail = fail
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
                             if (!fail) err = 0.0f;
                         }
 
-                        // retry per section 6.5.3.3
-                        if (IsDoubleSubnormal(s[j]))
+                        // try with both args as zero
+                        if (IsDoubleSubnormal(s2[j]))
                         {
-                            int correct3i, correct4i;
-                            long double correct3 =
-                                f->dfunc.f_ffpI(0.0, s2[j], &correct3i);
-                            long double correct4 =
-                                f->dfunc.f_ffpI(-0.0, s2[j], &correct4i);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct4);
-                            int64_t iErr3 =
-                                (long long)q2[j] - (long long)correct3i;
-                            int64_t iErr4 =
-                                (long long)q2[j] - (long long)correct4i;
+                            int correct7i, correct8i;
+                            correct3 = f->dfunc.f_ffpI(0.0, 0.0, &correct3i);
+                            correct4 = f->dfunc.f_ffpI(-0.0, 0.0, &correct4i);
+                            long double correct7 =
+                                f->dfunc.f_ffpI(0.0, -0.0, &correct7i);
+                            long double correct8 =
+                                f->dfunc.f_ffpI(-0.0, -0.0, &correct8i);
+                            err2 = Bruteforce_Ulp_Error_Double(test, correct3);
+                            err3 = Bruteforce_Ulp_Error_Double(test, correct4);
+                            float err4 =
+                                Bruteforce_Ulp_Error_Double(test, correct7);
+                            float err5 =
+                                Bruteforce_Ulp_Error_Double(test, correct8);
+                            iErr3 = (long long)q2[j] - (long long)correct3i;
+                            iErr4 = (long long)q2[j] - (long long)correct4i;
+                            int64_t iErr7 =
+                                (long long)q2[j] - (long long)correct7i;
+                            int64_t iErr8 =
+                                (long long)q2[j] - (long long)correct8i;
                             fail = fail
                                 && ((!(fabsf(err2) <= f->double_ulps
                                        && iErr3 == 0))
                                     && (!(fabsf(err3) <= f->double_ulps
-                                          && iErr4 == 0)));
+                                          && iErr4 == 0))
+                                    && (!(fabsf(err4) <= f->double_ulps
+                                          && iErr7 == 0))
+                                    && (!(fabsf(err5) <= f->double_ulps
+                                          && iErr8 == 0)));
                             if (fabsf(err2) < fabsf(err)) err = err2;
                             if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
                             if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
                             if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+                            if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
+                            if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
 
                             // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2,
+                            if (IsDoubleResultSubnormal(correct3,
                                                         f->double_ulps)
-                                || IsDoubleResultSubnormal(correct3,
+                                || IsDoubleResultSubnormal(correct4,
+                                                           f->double_ulps)
+                                || IsDoubleResultSubnormal(correct7,
+                                                           f->double_ulps)
+                                || IsDoubleResultSubnormal(correct8,
                                                            f->double_ulps))
                             {
                                 fail = fail
                                     && !(test == 0.0f
-                                         && (iErr3 == 0 || iErr4 == 0));
+                                         && (iErr3 == 0 || iErr4 == 0
+                                             || iErr7 == 0 || iErr8 == 0));
                                 if (!fail) err = 0.0f;
                             }
-
-                            // try with both args as zero
-                            if (IsDoubleSubnormal(s2[j]))
-                            {
-                                int correct7i, correct8i;
-                                correct3 =
-                                    f->dfunc.f_ffpI(0.0, 0.0, &correct3i);
-                                correct4 =
-                                    f->dfunc.f_ffpI(-0.0, 0.0, &correct4i);
-                                long double correct7 =
-                                    f->dfunc.f_ffpI(0.0, -0.0, &correct7i);
-                                long double correct8 =
-                                    f->dfunc.f_ffpI(-0.0, -0.0, &correct8i);
-                                err2 =
-                                    Bruteforce_Ulp_Error_Double(test, correct3);
-                                err3 =
-                                    Bruteforce_Ulp_Error_Double(test, correct4);
-                                float err4 =
-                                    Bruteforce_Ulp_Error_Double(test, correct7);
-                                float err5 =
-                                    Bruteforce_Ulp_Error_Double(test, correct8);
-                                iErr3 = (long long)q2[j] - (long long)correct3i;
-                                iErr4 = (long long)q2[j] - (long long)correct4i;
-                                int64_t iErr7 =
-                                    (long long)q2[j] - (long long)correct7i;
-                                int64_t iErr8 =
-                                    (long long)q2[j] - (long long)correct8i;
-                                fail = fail
-                                    && ((!(fabsf(err2) <= f->double_ulps
-                                           && iErr3 == 0))
-                                        && (!(fabsf(err3) <= f->double_ulps
-                                              && iErr4 == 0))
-                                        && (!(fabsf(err4) <= f->double_ulps
-                                              && iErr7 == 0))
-                                        && (!(fabsf(err5) <= f->double_ulps
-                                              && iErr8 == 0)));
-                                if (fabsf(err2) < fabsf(err)) err = err2;
-                                if (fabsf(err3) < fabsf(err)) err = err3;
-                                if (fabsf(err4) < fabsf(err)) err = err4;
-                                if (fabsf(err5) < fabsf(err)) err = err5;
-                                if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                                if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-                                if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
-                                if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
-
-                                // retry per section 6.5.3.4
-                                if (IsDoubleResultSubnormal(correct3,
-                                                            f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct4,
-                                                               f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct7,
-                                                               f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct8,
-                                                               f->double_ulps))
-                                {
-                                    fail = fail
-                                        && !(test == 0.0f
-                                             && (iErr3 == 0 || iErr4 == 0
-                                                 || iErr7 == 0 || iErr8 == 0));
-                                    if (!fail) err = 0.0f;
-                                }
-                            }
                         }
-                        else if (IsDoubleSubnormal(s2[j]))
+                    }
+                    else if (IsDoubleSubnormal(s2[j]))
+                    {
+                        int correct3i, correct4i;
+                        long double correct3 =
+                            f->dfunc.f_ffpI(s[j], 0.0, &correct3i);
+                        long double correct4 =
+                            f->dfunc.f_ffpI(s[j], -0.0, &correct4i);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= f->double_ulps
+                                      && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4
+                        if (IsDoubleResultSubnormal(correct2, f->double_ulps)
+                            || IsDoubleResultSubnormal(correct3,
+                                                       f->double_ulps))
                         {
-                            int correct3i, correct4i;
-                            long double correct3 =
-                                f->dfunc.f_ffpI(s[j], 0.0, &correct3i);
-                            long double correct4 =
-                                f->dfunc.f_ffpI(s[j], -0.0, &correct4i);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct4);
-                            int64_t iErr3 =
-                                (long long)q2[j] - (long long)correct3i;
-                            int64_t iErr4 =
-                                (long long)q2[j] - (long long)correct4i;
                             fail = fail
-                                && ((!(fabsf(err2) <= f->double_ulps
-                                       && iErr3 == 0))
-                                    && (!(fabsf(err3) <= f->double_ulps
-                                          && iErr4 == 0)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2,
-                                                        f->double_ulps)
-                                || IsDoubleResultSubnormal(correct3,
-                                                           f->double_ulps))
-                            {
-                                fail = fail
-                                    && !(test == 0.0f
-                                         && (iErr3 == 0 || iErr4 == 0));
-                                if (!fail) err = 0.0f;
-                            }
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
                         }
                     }
+                }
                     if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
@@ -1236,7 +1193,6 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
                         error = -1;
                         goto exit;
                     }
-                }
             }
         }
 
diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary.cpp
index 7e2073793a..5b8b42439a 100644
--- a/test_conformance/math_brute_force/i_unary.cpp
+++ b/test_conformance/math_brute_force/i_unary.cpp
@@ -248,11 +248,6 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info)))
         return error;
-    /*
-        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
-       programs + i) ) ) return error;
-    */
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
@@ -502,11 +497,6 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         return error;
     }
-    /*
-        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
-       i, programs + i) ) ) return error;
-    */
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad.cpp
index 9292649aa3..a510e5b118 100644
--- a/test_conformance/math_brute_force/mad.cpp
+++ b/test_conformance/math_brute_force/mad.cpp
@@ -248,8 +248,6 @@ int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode)
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
-    //    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM &
-    //    gFloatCapabilities);
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
     float maxErrorVal3 = 0.0f;
@@ -263,11 +261,6 @@ int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode)
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info)))
         return error;
-    /*
-        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
-       programs + i) ) ) return error;
-    */
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
@@ -380,247 +373,8 @@ int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode)
 
         if (gSkipCorrectnessTesting) break;
 
-        // Verify data  -- Commented out on purpose. no verification possible.
+        // Verify data -- No verification possible.
         // MAD is a random number generator.
-        /*
-                uint32_t *t = gOut_Ref;
-                for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                {
-                    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
-                    {
-                        uint32_t *q = gOut[k];
-
-                        // If we aren't getting the correctly rounded result
-                        if( t[j] != q[j] )
-                        {
-                            float test = ((float*) q)[j];
-                            double correct = f->func.f_fff( s[j], s2[j], s3[j]
-         ); float err = Ulp_Error( test, correct ); int fail = ! (fabsf(err) <=
-         f->float_ulps);
-
-                            if( fail && ftz )
-                            {
-                                // retry per section 6.5.3.2
-                                if( IsFloatSubnormal(correct) )
-                                { // look at me,
-                                    fail = fail && ( test != 0.0f );
-                                    if( ! fail )
-                                        err = 0.0f;
-                                }
-
-                                // retry per section 6.5.3.3
-                                if( fail && IsFloatSubnormal( s[j] ) )
-                                { // look at me,
-                                    double correct2 = f->func.f_fff( 0.0, s2[j],
-         s3[j] ); double correct3 = f->func.f_fff( -0.0, s2[j], s3[j] ); float
-         err2 = Ulp_Error( test, correct2  ); float err3 = Ulp_Error( test,
-         correct3  ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) &&
-         (!(fabsf(err3) <= f->float_ulps))); if( fabsf( err2 ) < fabsf(err ) )
-                                        err = err2;
-                                    if( fabsf( err3 ) < fabsf(err ) )
-                                        err = err3;
-
-                                    // retry per section 6.5.3.4
-                                    if( IsFloatResultSubnormal(correct2,
-         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) )
-                                    { // look at me now,
-                                        fail = fail && ( test != 0.0f);
-                                        if( ! fail )
-                                            err = 0.0f;
-                                    }
-
-                                    //try with first two args as zero
-                                    if( IsFloatSubnormal( s2[j] ) )
-                                    { // its fun to have fun,
-                                        correct2 = f->func.f_fff( 0.0, 0.0,
-         s3[j] ); correct3 = f->func.f_fff( -0.0, 0.0, s3[j] ); double correct4
-         = f->func.f_fff( 0.0, -0.0, s3[j] ); double correct5 = f->func.f_fff(
-         -0.0, -0.0, s3[j] ); err2 = Ulp_Error( test, correct2  ); err3 =
-         Ulp_Error( test, correct3  ); float err4 = Ulp_Error( test, correct4 );
-                                        float err5 = Ulp_Error( test, correct5
-         ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3)
-         <= f->float_ulps)) &&
-                                                         (!(fabsf(err4) <=
-         f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); if( fabsf( err2
-         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
-         err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) <
-         fabsf(err ) ) err = err5;
-
-                                        // retry per section 6.5.3.4
-                                        if( IsFloatResultSubnormal(correct2,
-         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ||
-                                            IsFloatResultSubnormal(correct4,
-         f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
-                                        {
-                                            fail = fail && ( test != 0.0f);
-                                            if( ! fail )
-                                                err = 0.0f;
-                                        }
-
-                                        if( IsFloatSubnormal( s3[j] )  )
-                                        { // but you have to know how!
-                                            correct2 = f->func.f_fff( 0.0, 0.0,
-         0.0f ); correct3 = f->func.f_fff( -0.0, 0.0, 0.0f ); correct4 =
-         f->func.f_fff( 0.0, -0.0, 0.0f ); correct5 = f->func.f_fff( -0.0, -0.0,
-         0.0f ); double correct6 = f->func.f_fff( 0.0, 0.0, -0.0f ); double
-         correct7 = f->func.f_fff( -0.0, 0.0, -0.0f ); double correct8 =
-         f->func.f_fff( 0.0, -0.0, -0.0f ); double correct9 = f->func.f_fff(
-         -0.0, -0.0, -0.0f ); err2 = Ulp_Error( test, correct2  ); err3 =
-         Ulp_Error( test, correct3  ); err4 = Ulp_Error( test, correct4  ); err5
-         = Ulp_Error( test, correct5  ); float err6 = Ulp_Error( test, correct6
-         ); float err7 = Ulp_Error( test, correct7  ); float err8 = Ulp_Error(
-         test, correct8  ); float err9 = Ulp_Error( test, correct9  ); fail =
-         fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <=
-         f->float_ulps)) &&
-                                                             (!(fabsf(err4) <=
-         f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps)) &&
-                                                             (!(fabsf(err5) <=
-         f->float_ulps)) && (!(fabsf(err6) <= f->float_ulps)) &&
-                                                             (!(fabsf(err7) <=
-         f->float_ulps)) && (!(fabsf(err8) <= f->float_ulps))); if( fabsf( err2
-         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
-         err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) <
-         fabsf(err ) ) err = err5; if( fabsf( err6 ) < fabsf(err ) ) err = err6;
-                                            if( fabsf( err7 ) < fabsf(err ) )
-                                                err = err7;
-                                            if( fabsf( err8 ) < fabsf(err ) )
-                                                err = err8;
-                                            if( fabsf( err9 ) < fabsf(err ) )
-                                                err = err9;
-
-                                            // retry per section 6.5.3.4
-                                            if( IsFloatResultSubnormal(correct2,
-         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps )  ||
-                                                IsFloatResultSubnormal(correct4,
-         f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps )  ||
-                                                IsFloatResultSubnormal(
-         correct6, f->float_ulps ) || IsFloatResultSubnormal(correct7,
-         f->float_ulps )  || IsFloatResultSubnormal(correct8, f->float_ulps ) ||
-         IsFloatResultSubnormal( correct9, f->float_ulps )  )
-                                            {
-                                                fail = fail && ( test != 0.0f);
-                                                if( ! fail )
-                                                    err = 0.0f;
-                                            }
-                                        }
-                                    }
-                                    else if( IsFloatSubnormal( s3[j] ) )
-                                    {
-                                        correct2 = f->func.f_fff( 0.0, s2[j],
-         0.0 ); correct3 = f->func.f_fff( -0.0, s2[j], 0.0 ); double correct4 =
-         f->func.f_fff( 0.0,  s2[j], -0.0 ); double correct5 = f->func.f_fff(
-         -0.0, s2[j], -0.0 ); err2 = Ulp_Error( test, correct2  ); err3 =
-         Ulp_Error( test, correct3  ); float err4 = Ulp_Error( test, correct4 );
-                                        float err5 = Ulp_Error( test, correct5
-         ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3)
-         <= f->float_ulps)) &&
-                                                         (!(fabsf(err4) <=
-         f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); if( fabsf( err2
-         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
-         err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) <
-         fabsf(err ) ) err = err5;
-
-                                        // retry per section 6.5.3.4
-                                        if( IsFloatResultSubnormal(correct2,
-         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps )  ||
-                                            IsFloatResultSubnormal(correct4,
-         f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
-                                        {
-                                            fail = fail && ( test != 0.0f);
-                                            if( ! fail )
-                                                err = 0.0f;
-                                        }
-                                    }
-                                }
-                                else if( fail && IsFloatSubnormal( s2[j] ) )
-                                {
-                                    double correct2 = f->func.f_fff( s[j], 0.0,
-         s3[j] ); double correct3 = f->func.f_fff( s[j], -0.0, s3[j] ); float
-         err2 = Ulp_Error( test, correct2  ); float err3 = Ulp_Error( test,
-         correct3  ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) &&
-         (!(fabsf(err3) <= f->float_ulps))); if( fabsf( err2 ) < fabsf(err ) )
-                                        err = err2;
-                                    if( fabsf( err3 ) < fabsf(err ) )
-                                        err = err3;
-
-                                    // retry per section 6.5.3.4
-                                    if( IsFloatResultSubnormal(correct2,
-         f->float_ulps )  || IsFloatResultSubnormal(correct3, f->float_ulps ) )
-                                    {
-                                        fail = fail && ( test != 0.0f);
-                                        if( ! fail )
-                                            err = 0.0f;
-                                    }
-
-                                    //try with second two args as zero
-                                    if( IsFloatSubnormal( s3[j] ) )
-                                    {
-                                        correct2 = f->func.f_fff( s[j], 0.0, 0.0
-         ); correct3 = f->func.f_fff( s[j], -0.0, 0.0 ); double correct4 =
-         f->func.f_fff( s[j], 0.0, -0.0 ); double correct5 = f->func.f_fff(
-         s[j], -0.0, -0.0 ); err2 = Ulp_Error( test, correct2  ); err3 =
-         Ulp_Error( test, correct3  ); float err4 = Ulp_Error( test, correct4 );
-                                        float err5 = Ulp_Error( test, correct5
-         ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3)
-         <= f->float_ulps)) &&
-                                                         (!(fabsf(err4) <=
-         f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); if( fabsf( err2
-         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
-         err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) <
-         fabsf(err ) ) err = err5;
-
-                                        // retry per section 6.5.3.4
-                                        if( IsFloatResultSubnormal(correct2,
-         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ||
-                                            IsFloatResultSubnormal(correct4,
-         f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
-                                        {
-                                            fail = fail && ( test != 0.0f);
-                                            if( ! fail )
-                                                err = 0.0f;
-                                        }
-                                    }
-                                }
-                                else if( fail && IsFloatSubnormal(s3[j]) )
-                                {
-                                    double correct2 = f->func.f_fff( s[j],
-         s2[j], 0.0 ); double correct3 = f->func.f_fff( s[j], s2[j], -0.0 );
-                                    float err2 = Ulp_Error( test, correct2  );
-                                    float err3 = Ulp_Error( test, correct3  );
-                                    fail =  fail && ((!(fabsf(err2) <=
-         f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps))); if( fabsf( err2
-         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
-         err3;
-
-                                    // retry per section 6.5.3.4
-                                    if( IsFloatResultSubnormal(correct2,
-         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) )
-                                    {
-                                        fail = fail && ( test != 0.0f);
-                                        if( ! fail )
-                                            err = 0.0f;
-                                    }
-                                }
-                            }
-
-                            if( fabsf(err ) > maxError )
-                            {
-                                maxError = fabsf(err);
-                                maxErrorVal = s[j];
-                                maxErrorVal2 = s2[j];
-                                maxErrorVal3 = s3[j];
-                            }
-
-                            if( fail )
-                            {
-                                vlog_error( "\nERROR: %s%s: %f ulp error at {%a,
-         %a, %a}: *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j],
-         ((float*) gOut_Ref)[j], test ); error = -1; goto exit;
-                            }
-                        }
-                    }
-                }
-        */
         if (0 == (i & 0x0fffffff))
         {
             vlog(".");
@@ -758,7 +512,6 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
-    //    int ftz = f->ftz || gForceFTZ;
     double maxErrorVal = 0.0f;
     double maxErrorVal2 = 0.0f;
     double maxErrorVal3 = 0.0f;
@@ -776,11 +529,6 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         return error;
     }
-    /*
-        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
-       i, programs + i) ) ) return error;
-    */
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
@@ -893,266 +641,8 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
 
         if (gSkipCorrectnessTesting) break;
 
-        // Verify data  -- Commented out on purpose. no verification possible.
+        // Verify data -- No verification possible.
         // MAD is a random number generator.
-        /*
-                uint64_t *t = gOut_Ref;
-                for( j = 0; j < bufferSize / sizeof( double ); j++ )
-                {
-                    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
-                    {
-                        uint64_t *q = gOut[k];
-
-                        // If we aren't getting the correctly rounded result
-                        if( t[j] != q[j] )
-                        {
-                            double test = ((double*) q)[j];
-                            long double correct = f->dfunc.f_fff( s[j], s2[j],
-         s3[j] ); float err = Bruteforce_Ulp_Error_Double( test, correct ); int
-         fail = ! (fabsf(err) <= f->double_ulps);
-
-                            if( fail && ftz )
-                            {
-                                // retry per section 6.5.3.2
-                                if( IsDoubleResultSubnormal(correct,
-         f->double_ulps) ) { // look at me, fail = fail && ( test != 0.0f ); if(
-         ! fail ) err = 0.0f;
-                                }
-
-                                // retry per section 6.5.3.3
-                                if( fail && IsDoubleSubnormal( s[j] ) )
-                                { // look at me,
-                                    long double correct2 = f->dfunc.f_fff( 0.0,
-         s2[j], s3[j] ); long double correct3 = f->dfunc.f_fff( -0.0, s2[j],
-         s3[j] ); float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                    float err3 = Bruteforce_Ulp_Error_Double(
-         test, correct3  ); fail =  fail && ((!(fabsf(err2) <= f->double_ulps))
-         && (!(fabsf(err3) <= f->double_ulps))); if( fabsf( err2 ) < fabsf(err )
-         ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err = err3;
-
-                                    // retry per section 6.5.3.4
-                                    if( IsDoubleResultSubnormal( correct2,
-         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
-         ) { // look at me now, fail = fail && ( test != 0.0f); if( ! fail ) err
-         = 0.0f;
-                                    }
-
-                                    //try with first two args as zero
-                                    if( IsDoubleSubnormal( s2[j] ) )
-                                    { // its fun to have fun,
-                                        correct2 = f->dfunc.f_fff( 0.0, 0.0,
-         s3[j] ); correct3 = f->dfunc.f_fff( -0.0, 0.0, s3[j] ); long double
-         correct4 = f->dfunc.f_fff( 0.0, -0.0, s3[j] ); long double correct5 =
-         f->dfunc.f_fff( -0.0, -0.0, s3[j] ); err2 =
-         Bruteforce_Ulp_Error_Double( test, correct2  ); err3 =
-         Bruteforce_Ulp_Error_Double( test, correct3  ); float err4 =
-         Bruteforce_Ulp_Error_Double( test, correct4  ); float err5 =
-         Bruteforce_Ulp_Error_Double( test, correct5  ); fail =  fail &&
-         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
-         f->double_ulps)) &&
-                                                         (!(fabsf(err4) <=
-         f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); if( fabsf(
-         err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) )
-                                            err = err3;
-                                        if( fabsf( err4 ) < fabsf(err ) )
-                                            err = err4;
-                                        if( fabsf( err5 ) < fabsf(err ) )
-                                            err = err5;
-
-                                        // retry per section 6.5.3.4
-                                        if( IsDoubleResultSubnormal( correct2,
-         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
-         || IsDoubleResultSubnormal( correct4, f->double_ulps ) ||
-         IsDoubleResultSubnormal( correct5, f->double_ulps ) )
-                                        {
-                                            fail = fail && ( test != 0.0f);
-                                            if( ! fail )
-                                                err = 0.0f;
-                                        }
-
-                                        if( IsDoubleSubnormal( s3[j] )  )
-                                        { // but you have to know how!
-                                            correct2 = f->dfunc.f_fff( 0.0, 0.0,
-         0.0f ); correct3 = f->dfunc.f_fff( -0.0, 0.0, 0.0f ); correct4 =
-         f->dfunc.f_fff( 0.0, -0.0, 0.0f ); correct5 = f->dfunc.f_fff( -0.0,
-         -0.0, 0.0f ); long double correct6 = f->dfunc.f_fff( 0.0, 0.0, -0.0f );
-                                            long double correct7 =
-         f->dfunc.f_fff( -0.0, 0.0, -0.0f ); long double correct8 =
-         f->dfunc.f_fff( 0.0, -0.0, -0.0f ); long double correct9 =
-         f->dfunc.f_fff( -0.0, -0.0, -0.0f ); err2 =
-         Bruteforce_Ulp_Error_Double( test, correct2  ); err3 =
-         Bruteforce_Ulp_Error_Double( test, correct3  ); err4 =
-         Bruteforce_Ulp_Error_Double( test, correct4  ); err5 =
-         Bruteforce_Ulp_Error_Double( test, correct5  ); float err6 =
-         Bruteforce_Ulp_Error_Double( test, correct6  ); float err7 =
-         Bruteforce_Ulp_Error_Double( test, correct7  ); float err8 =
-         Bruteforce_Ulp_Error_Double( test, correct8  ); float err9 =
-         Bruteforce_Ulp_Error_Double( test, correct9  ); fail =  fail &&
-         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
-         f->double_ulps)) &&
-                                                             (!(fabsf(err4) <=
-         f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)) &&
-                                                             (!(fabsf(err5) <=
-         f->double_ulps)) && (!(fabsf(err6) <= f->double_ulps)) &&
-                                                             (!(fabsf(err7) <=
-         f->double_ulps)) && (!(fabsf(err8) <= f->double_ulps))); if( fabsf(
-         err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) )
-                                                err = err3;
-                                            if( fabsf( err4 ) < fabsf(err ) )
-                                                err = err4;
-                                            if( fabsf( err5 ) < fabsf(err ) )
-                                                err = err5;
-                                            if( fabsf( err6 ) < fabsf(err ) )
-                                                err = err6;
-                                            if( fabsf( err7 ) < fabsf(err ) )
-                                                err = err7;
-                                            if( fabsf( err8 ) < fabsf(err ) )
-                                                err = err8;
-                                            if( fabsf( err9 ) < fabsf(err ) )
-                                                err = err9;
-
-                                            // retry per section 6.5.3.4
-                                            if( IsDoubleResultSubnormal(
-         correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3,
-         f->double_ulps )  || IsDoubleResultSubnormal( correct4, f->double_ulps
-         ) || IsDoubleResultSubnormal( correct5, f->double_ulps )  ||
-                                                IsDoubleResultSubnormal(
-         correct6, f->double_ulps ) || IsDoubleResultSubnormal( correct7,
-         f->double_ulps )  || IsDoubleResultSubnormal( correct8, f->double_ulps
-         ) || IsDoubleResultSubnormal( correct9, f->double_ulps )  )
-                                            {
-                                                fail = fail && ( test != 0.0f);
-                                                if( ! fail )
-                                                    err = 0.0f;
-                                            }
-                                        }
-                                    }
-                                    else if( IsDoubleSubnormal( s3[j] ) )
-                                    {
-                                        correct2 = f->dfunc.f_fff( 0.0, s2[j],
-         0.0 ); correct3 = f->dfunc.f_fff( -0.0, s2[j], 0.0 ); long double
-         correct4 = f->dfunc.f_fff( 0.0,  s2[j], -0.0 ); long double correct5 =
-         f->dfunc.f_fff( -0.0, s2[j], -0.0 ); err2 =
-         Bruteforce_Ulp_Error_Double( test, correct2  ); err3 =
-         Bruteforce_Ulp_Error_Double( test, correct3  ); float err4 =
-         Bruteforce_Ulp_Error_Double( test, correct4  ); float err5 =
-         Bruteforce_Ulp_Error_Double( test, correct5  ); fail =  fail &&
-         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
-         f->double_ulps)) &&
-                                                         (!(fabsf(err4) <=
-         f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); if( fabsf(
-         err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) )
-                                            err = err3;
-                                        if( fabsf( err4 ) < fabsf(err ) )
-                                            err = err4;
-                                        if( fabsf( err5 ) < fabsf(err ) )
-                                            err = err5;
-
-                                        // retry per section 6.5.3.4
-                                        if( IsDoubleResultSubnormal( correct2,
-         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
-         || IsDoubleResultSubnormal( correct4, f->double_ulps ) ||
-         IsDoubleResultSubnormal( correct5, f->double_ulps ) )
-                                        {
-                                            fail = fail && ( test != 0.0f);
-                                            if( ! fail )
-                                                err = 0.0f;
-                                        }
-                                    }
-                                }
-                                else if( fail && IsDoubleSubnormal( s2[j] ) )
-                                {
-                                    long double correct2 = f->dfunc.f_fff( s[j],
-         0.0, s3[j] ); long double correct3 = f->dfunc.f_fff( s[j], -0.0, s3[j]
-         ); float err2 = Bruteforce_Ulp_Error_Double( test, correct2  ); float
-         err3 = Bruteforce_Ulp_Error_Double( test, correct3  ); fail =  fail &&
-         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
-         f->double_ulps))); if( fabsf( err2 ) < fabsf(err ) ) err = err2; if(
-         fabsf( err3 ) < fabsf(err ) ) err = err3;
-
-                                    // retry per section 6.5.3.4
-                                    if( IsDoubleResultSubnormal( correct2,
-         f->double_ulps )  || IsDoubleResultSubnormal( correct3, f->double_ulps
-         ) )
-                                    {
-                                        fail = fail && ( test != 0.0f);
-                                        if( ! fail )
-                                            err = 0.0f;
-                                    }
-
-                                    //try with second two args as zero
-                                    if( IsDoubleSubnormal( s3[j] ) )
-                                    {
-                                        correct2 = f->dfunc.f_fff( s[j], 0.0,
-         0.0 ); correct3 = f->dfunc.f_fff( s[j], -0.0, 0.0 ); long double
-         correct4 = f->dfunc.f_fff( s[j], 0.0, -0.0 ); long double correct5 =
-         f->dfunc.f_fff( s[j], -0.0, -0.0 ); err2 = Bruteforce_Ulp_Error_Double(
-         test, correct2  ); err3 = Bruteforce_Ulp_Error_Double( test, correct3
-         ); float err4 = Bruteforce_Ulp_Error_Double( test, correct4  ); float
-         err5 = Bruteforce_Ulp_Error_Double( test, correct5  ); fail =  fail &&
-         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
-         f->double_ulps)) &&
-                                                         (!(fabsf(err4) <=
-         f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); if( fabsf(
-         err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) )
-                                            err = err3;
-                                        if( fabsf( err4 ) < fabsf(err ) )
-                                            err = err4;
-                                        if( fabsf( err5 ) < fabsf(err ) )
-                                            err = err5;
-
-                                        // retry per section 6.5.3.4
-                                        if( IsDoubleResultSubnormal( correct2,
-         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
-         || IsDoubleResultSubnormal( correct4, f->double_ulps ) ||
-         IsDoubleResultSubnormal( correct5, f->double_ulps ) )
-                                        {
-                                            fail = fail && ( test != 0.0f);
-                                            if( ! fail )
-                                                err = 0.0f;
-                                        }
-                                    }
-                                }
-                                else if( fail && IsDoubleSubnormal(s3[j]) )
-                                {
-                                    long double correct2 = f->dfunc.f_fff( s[j],
-         s2[j], 0.0 ); long double correct3 = f->dfunc.f_fff( s[j], s2[j], -0.0
-         ); float err2 = Bruteforce_Ulp_Error_Double( test, correct2  ); float
-         err3 = Bruteforce_Ulp_Error_Double( test, correct3  ); fail =  fail &&
-         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
-         f->double_ulps))); if( fabsf( err2 ) < fabsf(err ) ) err = err2; if(
-         fabsf( err3 ) < fabsf(err ) ) err = err3;
-
-                                    // retry per section 6.5.3.4
-                                    if( IsDoubleResultSubnormal( correct2,
-         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
-         )
-                                    {
-                                        fail = fail && ( test != 0.0f);
-                                        if( ! fail )
-                                            err = 0.0f;
-                                    }
-                                }
-                            }
-
-                            if( fabsf(err ) > maxError )
-                            {
-                                maxError = fabsf(err);
-                                maxErrorVal = s[j];
-                                maxErrorVal2 = s2[j];
-                                maxErrorVal3 = s3[j];
-                            }
-
-                            if( fail )
-                            {
-                                vlog_error( "\nERROR: %sD%s: %f ulp error at
-         {%a, %a, %a}: *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j],
-         s3[j], ((double*) gOut_Ref)[j], test ); error = -1; goto exit;
-                            }
-                        }
-                    }
-                }
-        */
         if (0 == (i & 0x0fffffff))
         {
             vlog(".");
diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp
index 1a5a66905e..cfa5417659 100644
--- a/test_conformance/math_brute_force/reference_math.cpp
+++ b/test_conformance/math_brute_force/reference_math.cpp
@@ -36,9 +36,6 @@
 #define M_PI_4 (M_PI / 4)
 #endif
 
-#define EVALUATE(x) x
-#define CONCATENATE(x, y) x##EVALUATE(y)
-
 #pragma STDC FP_CONTRACT OFF
 static void __log2_ep(double *hi, double *lo, double x);
 
@@ -51,7 +48,6 @@ static const uint64d_t _CL_NAN = { 0x7ff8000000000000ULL };
 
 #define cl_make_nan() _CL_NAN.d
 
-static double reduce1(double x);
 static double reduce1(double x)
 {
     if (fabs(x) >= HEX_DBL(+, 1, 0, +, 53))
@@ -71,29 +67,6 @@ static double reduce1(double x)
     return x - z;
 }
 
-/*
-static double reduceHalf( double x );
-static double reduceHalf( double x )
-{
-    if( fabs(x) >= HEX_DBL( +, 1, 0, +, 52 ) )
-    {
-        if( fabs(x) == INFINITY )
-            return cl_make_nan();
-
-        return 0.0; //we patch up the sign for sinPi and cosPi later, since they
-need different signs
-    }
-
-    // Find the nearest multiple of 1
-    const double r = copysign( HEX_DBL( +, 1, 0, +, 52 ), x );
-    double z = x + r;
-    z -= r;
-
-    // subtract it from x. Value is now in the range -0.5 <= x <= 0.5
-    return x - z;
-}
-*/
-
 double reference_acospi(double x) { return reference_acos(x) / M_PI; }
 double reference_asinpi(double x) { return reference_asin(x) / M_PI; }
 double reference_atanpi(double x) { return reference_atan(x) / M_PI; }
@@ -196,7 +169,6 @@ static float fallback_frexpf(float x, int *iptr)
     return fu;
 }
 
-static inline int extractf(float, cl_uint *);
 static inline int extractf(float x, cl_uint *mant)
 {
     static float (*frexppf)(float, int *) = NULL;
@@ -217,7 +189,6 @@ static inline int extractf(float x, cl_uint *mant)
 
 // Shift right by shift bits. Any bits lost on the right side are bitwise OR'd
 // together and ORd into the LSB of the result
-static inline void shift_right_sticky_64(cl_ulong *p, int shift);
 static inline void shift_right_sticky_64(cl_ulong *p, int shift)
 {
     cl_ulong sticky = 0;
@@ -240,7 +211,6 @@ static inline void shift_right_sticky_64(cl_ulong *p, int shift)
 
 // Add two 64 bit mantissas. Bits that are below the LSB of the result are OR'd
 // into the LSB of the result
-static inline void add64(cl_ulong *p, cl_ulong c, int *exponent);
 static inline void add64(cl_ulong *p, cl_ulong c, int *exponent)
 {
     cl_ulong carry;
@@ -260,7 +230,6 @@ static inline void add64(cl_ulong *p, cl_ulong c, int *exponent)
 }
 
 // IEEE-754 round to nearest, ties to even rounding
-static float round_to_nearest_even_float(cl_ulong p, int exponent);
 static float round_to_nearest_even_float(cl_ulong p, int exponent)
 {
     union {
@@ -312,7 +281,6 @@ static float round_to_nearest_even_float(cl_ulong p, int exponent)
     return u.d;
 }
 
-static float round_to_nearest_even_float_ftz(cl_ulong p, int exponent);
 static float round_to_nearest_even_float_ftz(cl_ulong p, int exponent)
 {
     extern int gCheckTininessBeforeRounding;
@@ -370,7 +338,6 @@ static float round_to_nearest_even_float_ftz(cl_ulong p, int exponent)
 
 
 // IEEE-754 round toward zero.
-static float round_toward_zero_float(cl_ulong p, int exponent);
 static float round_toward_zero_float(cl_ulong p, int exponent)
 {
     union {
@@ -411,7 +378,6 @@ static float round_toward_zero_float(cl_ulong p, int exponent)
     return u.d;
 }
 
-static float round_toward_zero_float_ftz(cl_ulong p, int exponent);
 static float round_toward_zero_float_ftz(cl_ulong p, int exponent)
 {
     extern int gCheckTininessBeforeRounding;
@@ -452,7 +418,6 @@ static float round_toward_zero_float_ftz(cl_ulong p, int exponent)
 }
 
 // Subtract two significands.
-static inline void sub64(cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC);
 static inline void sub64(cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC)
 {
     cl_ulong carry;
@@ -688,9 +653,6 @@ double reference_minmag(double x, double y)
     return reference_fmin(x, y);
 }
 
-// double my_nextafter( double x, double y ){  return (double) nextafterf(
-// (float) x, (float) y ); }
-
 double reference_relaxed_mad(double a, double b, double c)
 {
     return ((float)a) * ((float)b) + (float)c;
@@ -733,7 +695,7 @@ double reference_rootn(double x, int i)
 }
 
 double reference_rsqrt(double x) { return 1.0 / reference_sqrt(x); }
-// double reference_sincos( double x, double *c ){ *c = cos(x); return sin(x); }
+
 double reference_sinpi(double x)
 {
     double r = reduce1(x);
@@ -888,7 +850,6 @@ double reference_fract(double x, double *ip)
 }
 
 
-// double my_fdim( double x, double y){ return fdimf( (float) x, (float) y ); }
 double reference_add(double x, double y)
 {
     volatile float a = (float)x;
@@ -1005,8 +966,6 @@ double reference_subtract(double x, double y)
     return a;
 }
 
-// double reference_divide( double x, double y ){ return (float) x / (float) y;
-// }
 double reference_multiply(double x, double y)
 {
     volatile float a = (float)x;
@@ -1080,18 +1039,6 @@ double reference_multiply(double x, double y)
     return a;
 }
 
-/*double my_remquo( double x, double y, int *iptr )
-{
-    if( isnan(x) || isnan(y) ||
-        fabs(x) == INFINITY  ||
-        y == 0.0 )
-    {
-        *iptr = 0;
-        return NAN;
-    }
-
-    return (double) remquof( (float) x, (float) y, iptr );
-}*/
 double reference_lgamma_r(double x, int *signp)
 {
     // This is not currently tested
@@ -1188,22 +1135,6 @@ double reference_cbrt(double x)
     return reference_copysignd(reference_pow(reference_fabs(x), 1.0 / 3.0), x);
 }
 
-/*
-double reference_scalbn(double x, int i)
-{ // suitable for checking single precision scalbnf only
-
-    if( i > 300 )
-        return copysign( INFINITY, x);
-    if( i < -300 )
-        return copysign( 0.0, x);
-
-    union{ cl_ulong u; double d;} u;
-    u.u = ((cl_ulong) i + 1023) << 52;
-
-    return x * u.d;
-}
-*/
-
 double reference_rint(double x)
 {
     if (reference_fabs(x) < HEX_DBL(+, 1, 0, +, 52))
@@ -1763,12 +1694,6 @@ int reference_ilogbl(long double x)
     return exponent - 1023;
 }
 
-// double reference_log2( double x )
-//{
-//    return log( x ) * 1.44269504088896340735992468100189214;
-//}
-
-
 double reference_relaxed_log2(double x) { return reference_log2(x); }
 
 double reference_log2(double x)
@@ -2487,32 +2412,6 @@ static inline double_double mul_dd(double_double a, double_double b)
     // the last 3 terms are two low to appear in the result
 
 
-    // accumulate from bottom up
-#if 0
-    // works but slow
-    result.hi = pC;
-    result = accum_d( result, pB );
-    result = accum_d( result, p7 );
-    result = accum_d( result, pA );
-    result = accum_d( result, p9 );
-    result = accum_d( result, p6 );
-    result = accum_d( result, p5 );
-    result = accum_d( result, p8 );
-    result = accum_d( result, p4 );
-    result = accum_d( result, p3 );
-    result = accum_d( result, p2 );
-    result = accum_d( result, p1 );
-    result = accum_d( result, p0 );
-
-    // canonicalize the result
-    double temp = result.hi;
-    result.hi += result.lo;
-    result.lo -= (result.hi - temp);
-    if( isnan( result.lo ) )
-        result.lo = 0.0;
-
-    return result;
-#else
     // take advantage of the known relative magnitudes of the partial products
     // to avoid some sorting Combine 2**-78 and 2**-104 terms. Here we are a bit
     // sloppy about canonicalizing the double_doubles
@@ -2554,7 +2453,6 @@ static inline double_double mul_dd(double_double a, double_double b)
 
     // Add in MSB's, and round to precision
     return accum_d(t1, p0); // canonicalizes
-#endif
 }
 
 
@@ -2742,7 +2640,6 @@ static double fallback_frexp(double x, int *iptr)
 }
 
 // Assumes zeros, infinities and NaNs handed elsewhere
-static inline int extract(double x, cl_ulong *mant);
 static inline int extract(double x, cl_ulong *mant)
 {
     static double (*frexpp)(double, int *) = NULL;
@@ -2762,7 +2659,6 @@ static inline int extract(double x, cl_ulong *mant)
 }
 
 // Return 128-bit product of a*b  as (hi << 64) + lo
-static inline void mul128(cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo);
 static inline void mul128(cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo)
 {
     cl_ulong alo = a & 0xffffffffULL;
@@ -2798,8 +2694,6 @@ static inline void renormalize(cl_ulong *hi, cl_ulong *lo, int *exponent)
     }
 }
 
-static double round_to_nearest_even_double(cl_ulong hi, cl_ulong lo,
-                                           int exponent);
 static double round_to_nearest_even_double(cl_ulong hi, cl_ulong lo,
                                            int exponent)
 {
@@ -2846,8 +2740,6 @@ static double round_to_nearest_even_double(cl_ulong hi, cl_ulong lo,
 
 // Shift right.  Bits lost on the right will be OR'd together and OR'd with the
 // LSB
-static inline void shift_right_sticky_128(cl_ulong *hi, cl_ulong *lo,
-                                          int shift);
 static inline void shift_right_sticky_128(cl_ulong *hi, cl_ulong *lo, int shift)
 {
     cl_ulong sticky = 0;
@@ -2886,8 +2778,6 @@ static inline void shift_right_sticky_128(cl_ulong *hi, cl_ulong *lo, int shift)
 // 128-bit add  of ((*hi << 64) + *lo) + ((chi << 64) + clo)
 // If the 129 bit result doesn't fit, bits lost off the right end will be OR'd
 // with the LSB
-static inline void add128(cl_ulong *hi, cl_ulong *lo, cl_ulong chi,
-                          cl_ulong clo, int *exp);
 static inline void add128(cl_ulong *hi, cl_ulong *lo, cl_ulong chi,
                           cl_ulong clo, int *exponent)
 {
@@ -2915,8 +2805,6 @@ static inline void add128(cl_ulong *hi, cl_ulong *lo, cl_ulong chi,
 }
 
 // 128-bit subtract  of ((chi << 64) + clo)  - ((*hi << 64) + *lo)
-static inline void sub128(cl_ulong *chi, cl_ulong *clo, cl_ulong hi,
-                          cl_ulong lo, cl_ulong *signC, int *expC);
 static inline void sub128(cl_ulong *chi, cl_ulong *clo, cl_ulong hi,
                           cl_ulong lo, cl_ulong *signC, int *expC)
 {
@@ -3096,9 +2984,6 @@ long double reference_madl(long double a, long double b, long double c)
     return a * b + c;
 }
 
-// long double my_nextafterl(long double x, long double y){  return (long
-// double) nextafter( (double) x, (double) y ); }
-
 long double reference_recipl(long double x) { return 1.0L / x; }
 
 long double reference_rootnl(long double x, int i)
@@ -3150,8 +3035,7 @@ long double reference_rootnl(long double x, int i)
 }
 
 long double reference_rsqrtl(long double x) { return 1.0L / sqrtl(x); }
-// long double reference_sincosl( long double x, long double *c ){ *c =
-// reference_cosl(x); return reference_sinl(x); }
+
 long double reference_sinpil(long double x)
 {
     double r = reduce1l(x);
@@ -3263,8 +3147,6 @@ long double reference_powrl(long double x, long double y)
     return reference_powl(x, y);
 }
 
-// long double my_fdiml( long double x, long double y){ return fdim( (double) x,
-// (double) y ); }
 long double reference_addl(long double x, long double y)
 {
     volatile double a = (double)x;
@@ -3316,27 +3198,12 @@ long double reference_multiplyl(long double x, long double y)
     return (long double)a;
 }
 
-/*long double my_remquol( long double x, long double y, int *iptr )
-{
-    if( isnan(x) || isnan(y) ||
-        fabs(x) == INFINITY  ||
-        y == 0.0 )
-    {
-        *iptr = 0;
-        return NAN;
-    }
-
-    return remquo( (double) x, (double) y, iptr );
-}*/
 long double reference_lgamma_rl(long double x, int *signp)
 {
-    //    long double lgamma_val = (long double)reference_lgamma( (double)x );
-    //    *signp = signgam;
     *signp = 0;
     return x;
 }
 
-
 int reference_isequall(long double x, long double y) { return x == y; }
 int reference_isfinitel(long double x) { return 0 != isfinite(x); }
 int reference_isgreaterl(long double x, long double y) { return x > y; }
@@ -3457,45 +3324,6 @@ long double reference_cbrtl(long double x)
     return reference_copysignl(powxy, x);
 }
 
-/*
-long double scalbnl( long double x, int i )
-{
-    //suitable for checking double precision scalbn only
-
-    if( i > 3000 )
-        return copysignl( INFINITY, x);
-    if( i < -3000 )
-        return copysignl( 0.0L, x);
-
-    if( i > 0 )
-    {
-        while( i >= 1000 )
-        {
-            x *= HEX_LDBL( +, 1, 0, +, 1000 );
-            i -= 1000;
-        }
-
-        union{ cl_ulong u; double d;}u;
-        u.u = (cl_ulong)( i + 1023 ) << 52;
-        x *= (long double) u.d;
-    }
-    else if( i < 0 )
-    {
-        while( i <= -1000 )
-        {
-            x *= HEX_LDBL( +, 1, 0, -, 1000 );
-            i += 1000;
-        }
-
-        union{ cl_ulong u; double d;}u;
-        u.u = (cl_ulong)( i + 1023 ) << 52;
-        x *= (long double) u.d;
-    }
-
-    return x;
-}
-*/
-
 long double reference_rintl(long double x)
 {
 #if defined(__PPC__)
@@ -3845,11 +3673,6 @@ long double reference_hypotl(long double x, long double y)
     return sqrtl(x * x + y * y);
 }
 
-// long double reference_log2l( long double x )
-//{
-//    return log( x ) * 1.44269504088896340735992468100189214L;
-//}
-
 long double reference_log2l(long double x)
 {
     if (isnan(x) || x < 0.0 || x == -INFINITY) return NAN;
@@ -3940,14 +3763,12 @@ long double reference_nanl(cl_ulong x)
 
 long double reference_reciprocall(long double x) { return 1.0L / x; }
 
-long double reference_remainderl(long double x, long double y);
 long double reference_remainderl(long double x, long double y)
 {
     int i;
     return reference_remquol(x, y, &i);
 }
 
-long double reference_lgammal(long double x);
 long double reference_lgammal(long double x)
 {
     // lgamma is currently not tested
@@ -3996,8 +3817,6 @@ typedef struct
     int sign; // sign of double
 } eprep_t;
 
-static eprep_t double_to_eprep(double x);
-
 static eprep_t double_to_eprep(double x)
 {
     eprep_t result;
@@ -4029,88 +3848,6 @@ static eprep_t double_to_eprep(double x)
     return result;
 }
 
-/*
- double eprep_to_double( uint32_t *R, int digits, int index, int sgn )
- {
- d_ui64_t nb, rndcorr;
- uint64_t lowpart, roundbits, t1;
- int expo, expofinal, shift;
- double res;
-
- nb.d = (double) R[0];
-
- t1   = R[1];
- lowpart  = (t1 << RADIX) + R[2];
- expo = ((nb.u & 0x7ff0000000000000ULL) >> 52) - 1023;
-
- expofinal = expo + RADIX*index;
-
- if (expofinal >  1023) {
- d_ui64_t inf = { 0x7ff0000000000000ULL };
- res = inf.d;
- }
-
- else if (expofinal >= -1022){
- shift = expo + 2*RADIX - 53;
- roundbits = lowpart << (64-shift);
- lowpart = lowpart >> shift;
- if (lowpart & 0x0000000000000001ULL) {
- if(roundbits == 0) {
- int i;
- for (i=3; i < digits; i++)
- roundbits = roundbits | R[i];
- }
- if(roundbits == 0) {
- if (lowpart & 0x0000000000000002ULL)
- rndcorr.u = (uint64_t) (expo - 52 + 1023) << 52;
- else
- rndcorr.d = 0.0;
- }
- else
- rndcorr.u = (uint64_t) (expo - 52 + 1023) << 52;
- }
- else{
- rndcorr.d = 0.0;
- }
-
- lowpart = lowpart >> 1;
- nb.u = nb.u | lowpart;
- res  = nb.d + rndcorr.d;
-
- if(index*RADIX + 1023 > 0) {
- nb.u = 0;
- nb.u = (uint64_t) (index*RADIX + 1023) << 52;
- res *= nb.d;
- }
- else {
- nb.u = 0;
- nb.u = (uint64_t) (index*RADIX + 1023 + 2*RADIX) << 52;
- res *= two_pow_two_mradix.d;
- res *= nb.d;
- }
- }
- else {
- if (expofinal < -1022 - 53 ) {
- res = 0.0;
- }
- else {
- lowpart = lowpart >> (expo + (2*RADIX) - 52);
- nb.u = nb.u | lowpart;
- nb.u = (nb.u & 0x000FFFFFFFFFFFFFULL) | 0x0010000000000000ULL;
- nb.u = nb.u >> (-1023 - expofinal);
- if(nb.u & 0x0000000000000001ULL)
- rndcorr.u = 1;
- else
- rndcorr.d = 0.0;
- res  = 0.5*(nb.d + rndcorr.d);
- }
- }
-
- return sgn*res;
- }
- */
-static double eprep_to_double(eprep_t epx);
-
 static double eprep_to_double(eprep_t epx)
 {
     double res = 0.0;
@@ -4122,8 +3859,6 @@ static double eprep_to_double(eprep_t epx)
     return copysign(res, epx.sign);
 }
 
-static int payne_hanek(double *y, int *exception);
-
 static int payne_hanek(double *y, int *exception)
 {
     double x = *y;
@@ -4812,8 +4547,6 @@ static void __log2_ep(double *hi, double *lo, double x)
 
 long double reference_powl(long double x, long double y)
 {
-
-
     // this will be used for testing doubles i.e. arguments will
     // be doubles so cast the input back to double ... returned
     // result will be long double though .... > 53 bits of precision
@@ -4827,9 +4560,6 @@ long double reference_powl(long double x, long double y)
     // causes errors. So we need to tread y as long double and convert it
     // to hi, lo doubles when performing y*log2(x).
 
-    //    double x = (double) xx;
-    //    double y = (double) yy;
-
     static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53);
 
     // if x = 1, return x for any y, even NaN
@@ -5094,8 +4824,6 @@ double reference_remquo(double xd, double yd, int *n)
 
         if (ex - ey >= 0)
         {
-
-
             int i;
             for (i = ex - ey; i > 0; i--)
             {
@@ -5137,7 +4865,6 @@ double reference_remquo(double xd, double yd, int *n)
 
 long double reference_remquol(long double xd, long double yd, int *n)
 {
-
     double xx = (double)xd;
     double yy = (double)yd;
 
@@ -5174,14 +4901,12 @@ long double reference_remquol(long double xd, long double yd, int *n)
 
     if (ex - ey >= -1)
     {
-
         yr = reference_ldexp(y, -ey);
         xr = reference_ldexp(x, -ex);
         int i;
 
         if (ex - ey >= 0)
         {
-
             for (i = ex - ey; i > 0; i--)
             {
                 q <<= 1;
@@ -5484,10 +5209,6 @@ long double reference_logl(long double x)
     double logxHi, logxLo;
     __log2_ep(&logxHi, &logxLo, x);
 
-    // double rhi, rlo;
-    // MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo);
-    // return (long double) rhi + (long double) rlo;
-
     long double lg2 = (long double)log2Hi + (long double)log2Lo;
     long double logx = (long double)logxHi + (long double)logxLo;
     return logx * lg2;
@@ -5872,10 +5593,6 @@ long double reference_log10l(long double x)
     double logxHi, logxLo;
     __log2_ep(&logxHi, &logxLo, x);
 
-    // double rhi, rlo;
-    // MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo);
-    // return (long double) rhi + (long double) rlo;
-
     long double lg2 = (long double)log2Hi + (long double)log2Lo;
     long double logx = (long double)logxHi + (long double)logxLo;
     return logx * lg2;
diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary.cpp
index 448a7c3db9..015dbc25db 100644
--- a/test_conformance/math_brute_force/ternary.cpp
+++ b/test_conformance/math_brute_force/ternary.cpp
@@ -360,11 +360,6 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info)))
         return error;
-    /*
-     for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-     if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs +
-     i) ) ) return error;
-     */
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
@@ -1189,11 +1184,6 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
     {
         return error;
     }
-    /*
-     for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-     if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i,
-     programs + i) ) ) return error;
-     */
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results.cpp
index d468d26de0..177239adb1 100644
--- a/test_conformance/math_brute_force/unary_two_results.cpp
+++ b/test_conformance/math_brute_force/unary_two_results.cpp
@@ -261,11 +261,6 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info)))
         return error;
-    /*
-        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
-       programs + i) ) ) return error;
-    */
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
@@ -796,11 +791,6 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         return error;
     }
-    /*
-        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
-       i, programs + i) ) ) return error;
-    */
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i.cpp
index c71de0ed39..b79f91a2fd 100644
--- a/test_conformance/math_brute_force/unary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i.cpp
@@ -271,11 +271,6 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info)))
         return error;
-    /*
-        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
-       programs + i) ) ) return error;
-    */
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
@@ -632,11 +627,6 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         return error;
     }
-    /*
-        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
-       i, programs + i) ) ) return error;
-    */
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u.cpp
index 397ff877ee..43e997ef8c 100644
--- a/test_conformance/math_brute_force/unary_u.cpp
+++ b/test_conformance/math_brute_force/unary_u.cpp
@@ -252,11 +252,6 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info)))
         return error;
-    /*
-        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
-       programs + i) ) ) return error;
-    */
 
     if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
     {
@@ -563,11 +558,6 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
     {
         return error;
     }
-    /*
-        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
-       i, programs + i) ) ) return error;
-    */
 
     for (i = 0; i < (1ULL << 32); i += step)
     {

From 0130c24fe5da57e74949466574c8ec4969e68121 Mon Sep 17 00:00:00 2001
From: Nikhil Joshi <nikhilj@nvidia.com>
Date: Thu, 21 Jan 2021 19:01:54 +0530
Subject: [PATCH 022/158]  Fix malloc-size calculation in test imagedim (#1100)

* Fix enqueue_flags test to use correct barrier type.

Currently, enqueue_flags test uses CLK_LOCAL_MEM_FENCE.
Use CLK_GLOBAL_MEM_FENCE instead as all threads across work-groups
need to wait here.

* Add check for support for Read-Write images

Read-Write images have required OpenCL 2.x.
Read-Write image tests are already being skipped
for 1.x devices.
With OpenCL 3.0, read-write images being optional,
the tests should be run or skipped
depending on the implementation support.

Add a check to decide if Read-Write images are
supported or required to be supported depending
on OpenCL version and decide if the tests should
be run or skipped.

Fixes issue #894

* Fix formatting in case of Read-Write image checks.

Fix formatting in case of Read-write image checks.
Also, combine two ifs into one in case of
kernel_read_write tests

* Fix some more formatting for RW-image checks

Remove unnecessary spaces at various places.
Also, fix lengthy lines.

* Fix malloc-size calculation in test imagedim

unsigned char size is silently assumed to be 1
in imagedim test of test_basic.
Pass sizeof(type) in malloc size calculation.
Also, change loop variable from signed to unsigned.
Add checks for null pointer for malloced memory.

* Use size_t instead of int for imagedim

The size calculation for image with larger dimensions
is overflowing with int values.
Change image dim variables to use size_t
to avoid overflow.
While at it, fix formatting at various places.

* Use new instead of malloc in test imagedim

Use new and delete in place of malloc
and free through test_basic imagedim
to avoid NULL pointer checks.

* Revert sizeof changes from size calculation

As the types of width and height are now
changed to size_t, sizeof is not required
in size calculation.
Revert the same.
---
 test_conformance/basic/test_imagedim.cpp | 158 ++++++++++++-----------
 1 file changed, 80 insertions(+), 78 deletions(-)

diff --git a/test_conformance/basic/test_imagedim.cpp b/test_conformance/basic/test_imagedim.cpp
index 6064655f0c..008c88b6af 100644
--- a/test_conformance/basic/test_imagedim.cpp
+++ b/test_conformance/basic/test_imagedim.cpp
@@ -38,24 +38,25 @@ static const char *image_dim_kernel_code =
 "}\n";
 
 
-static unsigned char *
-generate_8888_image(int w, int h, MTdata d)
+static unsigned char *generate_8888_image(size_t w, size_t h, MTdata d)
 {
-    unsigned char   *ptr = (unsigned char*)malloc(w * h * 4);
-    int             i;
+    unsigned char *ptr = new unsigned char[4 * w * h];
+    size_t i;
 
-    for (i=0; i<w*h*4; i++)
+    for (i = 0; i < w * h * 4; i++)
+    {
         ptr[i] = (unsigned char)genrand_int32(d);
+    }
 
     return ptr;
 }
 
-static int
-verify_8888_image(unsigned char *image, unsigned char *outptr, int w, int h)
+static int verify_8888_image(unsigned char *image, unsigned char *outptr,
+                             size_t w, size_t h)
 {
-    int     i;
+    size_t i;
 
-    for (i=0; i<w*h; i++)
+    for (i = 0; i < w * h; i++)
     {
         if (outptr[i] != image[i])
             return -1;
@@ -68,18 +69,18 @@ verify_8888_image(unsigned char *image, unsigned char *outptr, int w, int h)
 int
 test_imagedim_pow2(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
 {
-    cl_mem            streams[2];
-    cl_image_format    img_format;
-    unsigned char    *input_ptr, *output_ptr;
-    cl_program        program;
-    cl_kernel        kernel;
-    size_t    threads[2];
-     cl_ulong    max_mem_size;
-    int                img_width, max_img_width;
-    int                img_height, max_img_height;
-    int                max_img_dim;
-    int                i, j, i2, j2, err=0;
-    size_t            max_image2d_width, max_image2d_height;
+    cl_mem streams[2];
+    cl_image_format img_format;
+    unsigned char *input_ptr, *output_ptr;
+    cl_program program;
+    cl_kernel kernel;
+    size_t threads[2];
+    cl_ulong max_mem_size;
+    size_t img_width, max_img_width;
+    size_t img_height, max_img_height;
+    size_t max_img_dim;
+    int i, j, i2, j2, err = 0;
+    size_t max_image2d_width, max_image2d_height;
     int total_errors = 0;
     MTdata  d;
 
@@ -120,15 +121,15 @@ test_imagedim_pow2(cl_device_id device, cl_context context, cl_command_queue que
     cl_sampler sampler = clCreateSampler(context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &err);
     test_error(err, "clCreateSampler failed");
 
-    max_img_width = (int)max_image2d_width;
-    max_img_height = (int)max_image2d_height;
+    max_img_width = max_image2d_width;
+    max_img_height = max_image2d_height;
 
     // determine max image dim we can allocate - assume RGBA image, 4 bytes per pixel,
   //  and we want to consume 1/4 of global memory (this is the minimum required to be
   //  supported by the spec)
     max_mem_size /= 4; // use 1/4
     max_mem_size /= 4; // 4 bytes per pixel
-    max_img_dim = (int)sqrt((double)max_mem_size);
+    max_img_dim = (size_t)sqrt((double)max_mem_size);
     // convert to a power of 2
     {
         unsigned int    n = (unsigned int)max_img_dim;
@@ -138,7 +139,7 @@ test_imagedim_pow2(cl_device_id device, cl_context context, cl_command_queue que
         while (m > n)
             m >>= 1;
 
-        max_img_dim = (int)m;
+        max_img_dim = m;
     }
 
     if (max_img_width > max_img_dim)
@@ -151,13 +152,14 @@ test_imagedim_pow2(cl_device_id device, cl_context context, cl_command_queue que
 
     d = init_genrand( gRandomSeed );
     input_ptr = generate_8888_image(max_img_width, max_img_height, d);
-    output_ptr = (unsigned char*)malloc(sizeof(unsigned char) * 4 * max_img_width * max_img_height);
+
+    output_ptr = new unsigned char[4 * max_img_width * max_img_height];
 
     // test power of 2 width, height starting at 1 to 4K
-    for (i=1,i2=0; i<=max_img_height; i<<=1,i2++)
+    for (i = 1, i2 = 0; i <= max_img_height; i <<= 1, i2++)
     {
         img_height = (1 << i2);
-        for (j=1,j2=0; j<=max_img_width; j<<=1,j2++)
+        for (j = 1, j2 = 0; j <= max_img_width; j <<= 1, j2++)
         {
             img_width = (1 << j2);
 
@@ -169,8 +171,8 @@ test_imagedim_pow2(cl_device_id device, cl_context context, cl_command_queue que
             if (!streams[0])
             {
                 log_error("create_image_2d failed.  width = %d, height = %d\n", img_width, img_height);
-                free(input_ptr);
-                free(output_ptr);
+                delete[] input_ptr;
+                delete[] output_ptr;
                 free_mtdata(d);
                 return -1;
             }
@@ -183,8 +185,8 @@ test_imagedim_pow2(cl_device_id device, cl_context context, cl_command_queue que
             {
                 log_error("create_image_2d failed.  width = %d, height = %d\n", img_width, img_height);
                 clReleaseMemObject(streams[0]);
-                free(input_ptr);
-                free(output_ptr);
+                delete[] input_ptr;
+                delete[] output_ptr;
                 free_mtdata(d);
                 return -1;
             }
@@ -197,8 +199,8 @@ test_imagedim_pow2(cl_device_id device, cl_context context, cl_command_queue que
                 log_error("clWriteImage failed\n");
                 clReleaseMemObject(streams[0]);
                 clReleaseMemObject(streams[1]);
-                free(input_ptr);
-                free(output_ptr);
+                delete[] input_ptr;
+                delete[] output_ptr;
                 free_mtdata(d);
                 return -1;
             }
@@ -211,8 +213,8 @@ test_imagedim_pow2(cl_device_id device, cl_context context, cl_command_queue que
                 log_error("clSetKernelArgs failed\n");
                 clReleaseMemObject(streams[0]);
                 clReleaseMemObject(streams[1]);
-                free(input_ptr);
-                free(output_ptr);
+                delete[] input_ptr;
+                delete[] output_ptr;
                 free_mtdata(d);
                 return -1;
             }
@@ -228,8 +230,8 @@ test_imagedim_pow2(cl_device_id device, cl_context context, cl_command_queue que
                             img_width, img_height);
                 clReleaseMemObject(streams[0]);
                 clReleaseMemObject(streams[1]);
-                free(input_ptr);
-                free(output_ptr);
+                delete[] input_ptr;
+                delete[] output_ptr;
                 free_mtdata(d);
                 return -1;
             }
@@ -241,8 +243,8 @@ test_imagedim_pow2(cl_device_id device, cl_context context, cl_command_queue que
                             img_width, img_height);
                 clReleaseMemObject(streams[0]);
                 clReleaseMemObject(streams[1]);
-                free(input_ptr);
-                free(output_ptr);
+                delete[] input_ptr;
+                delete[] output_ptr;
                 free_mtdata(d);
                 return -1;
             }
@@ -259,8 +261,8 @@ test_imagedim_pow2(cl_device_id device, cl_context context, cl_command_queue que
     }
 
     // cleanup
-    free(input_ptr);
-    free(output_ptr);
+    delete[] input_ptr;
+    delete[] output_ptr;
     free_mtdata(d);
     clReleaseSampler(sampler);
     clReleaseKernel(kernel);
@@ -274,18 +276,18 @@ test_imagedim_pow2(cl_device_id device, cl_context context, cl_command_queue que
 int
 test_imagedim_non_pow2(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
 {
-    cl_mem            streams[2];
-    cl_image_format    img_format;
-    unsigned char    *input_ptr, *output_ptr;
-    cl_program        program;
-    cl_kernel        kernel;
-    size_t    threads[2], local_threads[2];
-    cl_ulong    max_mem_size;
-    int                img_width, max_img_width;
-    int                img_height, max_img_height;
-    int                max_img_dim;
-    int                i, j, i2, j2, err=0;
-    size_t            max_image2d_width, max_image2d_height;
+    cl_mem streams[2];
+    cl_image_format img_format;
+    unsigned char *input_ptr, *output_ptr;
+    cl_program program;
+    cl_kernel kernel;
+    size_t threads[2], local_threads[2];
+    cl_ulong max_mem_size;
+    size_t img_width, max_img_width;
+    size_t img_height, max_img_height;
+    size_t max_img_dim;
+    int i, j, i2, j2, err = 0;
+    size_t max_image2d_width, max_image2d_height;
     int total_errors = 0;
     size_t max_local_workgroup_size[3];
     MTdata d;
@@ -365,10 +367,10 @@ test_imagedim_non_pow2(cl_device_id device, cl_context context, cl_command_queue
 
     d = init_genrand( gRandomSeed );
     input_ptr = generate_8888_image(max_img_width, max_img_height, d);
-    output_ptr = (unsigned char*)malloc(sizeof(unsigned char) * 4 * max_img_width * max_img_height);
+    output_ptr = new unsigned char[4 * max_img_width * max_img_height];
 
     int plus_minus;
-    for (plus_minus=0; plus_minus < 3; plus_minus++)
+    for (plus_minus = 0; plus_minus < 3; plus_minus++)
     {
 
     // test power of 2 width, height starting at 1 to 4K
@@ -379,8 +381,8 @@ test_imagedim_non_pow2(cl_device_id device, cl_context context, cl_command_queue
             {
                 img_width = (1 << j2);
 
-                int effective_img_height = img_height;
-                int effective_img_width = img_width;
+                size_t effective_img_height = img_height;
+                size_t effective_img_width = img_width;
 
                 local_threads[0] = 1;
                 local_threads[1] = 1;
@@ -414,8 +416,8 @@ test_imagedim_non_pow2(cl_device_id device, cl_context context, cl_command_queue
                 if (!streams[0])
                 {
                     log_error("create_image_2d failed.  width = %d, height = %d\n", effective_img_width, effective_img_height);
-                    free(input_ptr);
-                    free(output_ptr);
+                    delete[] input_ptr;
+                    delete[] output_ptr;
                     free_mtdata(d);
                     return -1;
                 }
@@ -428,8 +430,8 @@ test_imagedim_non_pow2(cl_device_id device, cl_context context, cl_command_queue
                 {
                     log_error("create_image_2d failed.  width = %d, height = %d\n", effective_img_width, effective_img_height);
                     clReleaseMemObject(streams[0]);
-                    free(input_ptr);
-                    free(output_ptr);
+                    delete[] input_ptr;
+                    delete[] output_ptr;
                     free_mtdata(d);
                     return -1;
                 }
@@ -442,8 +444,8 @@ test_imagedim_non_pow2(cl_device_id device, cl_context context, cl_command_queue
                     log_error("clWriteImage failed\n");
                     clReleaseMemObject(streams[0]);
                     clReleaseMemObject(streams[1]);
-                    free(input_ptr);
-                    free(output_ptr);
+                    delete[] input_ptr;
+                    delete[] output_ptr;
                     free_mtdata(d);
                     return -1;
                 }
@@ -456,8 +458,8 @@ test_imagedim_non_pow2(cl_device_id device, cl_context context, cl_command_queue
                     log_error("clSetKernelArgs failed\n");
                     clReleaseMemObject(streams[0]);
                     clReleaseMemObject(streams[1]);
-                    free(input_ptr);
-                    free(output_ptr);
+                    delete[] input_ptr;
+                    delete[] output_ptr;
                     free_mtdata(d);
                     return -1;
                 }
@@ -474,8 +476,8 @@ test_imagedim_non_pow2(cl_device_id device, cl_context context, cl_command_queue
                                 effective_img_width, effective_img_height, (int)local_threads[0], (int)local_threads[1]);
                     clReleaseMemObject(streams[0]);
                     clReleaseMemObject(streams[1]);
-                    free(input_ptr);
-                    free(output_ptr);
+                    delete[] input_ptr;
+                    delete[] output_ptr;
                     free_mtdata(d);
                     return -1;
                 }
@@ -487,8 +489,8 @@ test_imagedim_non_pow2(cl_device_id device, cl_context context, cl_command_queue
                                 effective_img_width, effective_img_height, (int)local_threads[0], (int)local_threads[1]);
                     clReleaseMemObject(streams[0]);
                     clReleaseMemObject(streams[1]);
-                    free(input_ptr);
-                    free(output_ptr);
+                    delete[] input_ptr;
+                    delete[] output_ptr;
                     free_mtdata(d);
                     return -1;
                 }
@@ -506,15 +508,15 @@ test_imagedim_non_pow2(cl_device_id device, cl_context context, cl_command_queue
 
   }
 
-    // cleanup
-    free(input_ptr);
-    free(output_ptr);
-    free_mtdata(d);
-    clReleaseSampler(sampler);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
+  // cleanup
+  delete[] input_ptr;
+  delete[] output_ptr;
+  free_mtdata(d);
+  clReleaseSampler(sampler);
+  clReleaseKernel(kernel);
+  clReleaseProgram(program);
 
-    return total_errors;
+  return total_errors;
 }
 
 

From bf883dc8009ab038791fe1824add956a36b62ad7 Mon Sep 17 00:00:00 2001
From: ellnor01 <51320439+ellnor01@users.noreply.github.com>
Date: Fri, 22 Jan 2021 15:28:47 +0000
Subject: [PATCH 023/158] Fix compilation issues for conforming_version.cpp
 (#1113)

Previously this file used the auto keyword to
declare a string. This can cause compilation
issues when used in a later function.

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>
---
 test_conformance/computeinfo/conforming_version.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_conformance/computeinfo/conforming_version.cpp b/test_conformance/computeinfo/conforming_version.cpp
index 8c7eb29d50..624cf85a22 100644
--- a/test_conformance/computeinfo/conforming_version.cpp
+++ b/test_conformance/computeinfo/conforming_version.cpp
@@ -21,7 +21,7 @@
 int test_conformance_version(cl_device_id deviceID, cl_context context,
                              cl_command_queue ignoreQueue, int num_elements)
 {
-    auto version_string{ get_device_info_string(
+    std::string version_string{ get_device_info_string(
         deviceID, CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED) };
 
     // Latest conformance version passed should match vYYYY-MM-DD-XX, where XX
@@ -34,4 +34,4 @@ int test_conformance_version(cl_device_id deviceID, cl_context context,
         "valid format vYYYY-MM-DD-XX");
 
     return TEST_PASS;
-}
\ No newline at end of file
+}

From f337e0b6f9878c2e8f3c3fdad3ba712f2f88be14 Mon Sep 17 00:00:00 2001
From: Nikhil Joshi <nikhilj@nvidia.com>
Date: Fri, 29 Jan 2021 19:43:54 +0530
Subject: [PATCH 024/158]  Fix command-line function range for bruteforce 
 (#1127)

* Fix enqueue_flags test to use correct barrier type.

Currently, enqueue_flags test uses CLK_LOCAL_MEM_FENCE.
Use CLK_GLOBAL_MEM_FENCE instead as all threads across work-groups
need to wait here.

* Add check for support for Read-Write images

Read-Write images have required OpenCL 2.x.
Read-Write image tests are already being skipped
for 1.x devices.
With OpenCL 3.0, read-write images being optional,
the tests should be run or skipped
depending on the implementation support.

Add a check to decide if Read-Write images are
supported or required to be supported depending
on OpenCL version and decide if the tests should
be run or skipped.

Fixes issue #894

* Fix formatting in case of Read-Write image checks.

Fix formatting in case of Read-write image checks.
Also, combine two ifs into one in case of
kernel_read_write tests

* Fix some more formatting for RW-image checks

Remove unnecessary spaces at various places.
Also, fix lengthy lines.

* Fix malloc-size calculation in test imagedim

unsigned char size is silently assumed to be 1
in imagedim test of test_basic.
Pass sizeof(type) in malloc size calculation.
Also, change loop variable from signed to unsigned.
Add checks for null pointer for malloced memory.

* Fix command-line function range for bruteforce

Running "test_bruteforce N M" is expected to skip
first N functions and test M functions after it.
When N is 0, the test currently skips M functions
and runs all functions thereafter.
Fix the test to honor semantics of these
command-line options to correctly test
first M functions when N is 0.
---
 test_conformance/math_brute_force/main.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index ca58f2e5fc..254b22aac6 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -56,8 +56,8 @@ char appName[MAXPATHLEN] = "";
 cl_device_id gDevice = NULL;
 cl_context gContext = NULL;
 cl_command_queue gQueue = NULL;
-static int32_t gStartTestNumber;
-static int32_t gEndTestNumber;
+static int32_t gStartTestNumber = -1;
+static int32_t gEndTestNumber = -1;
 int gSkipCorrectnessTesting = 0;
 int gStopOnError = 0;
 static bool gSkipRestOfTests;
@@ -1005,7 +1005,7 @@ static int ParseArgs(int argc, const char **argv)
             long number = strtol(arg, &t, 0);
             if (t != arg)
             {
-                if (0 == gStartTestNumber)
+                if (-1 == gStartTestNumber)
                     gStartTestNumber = (int32_t)number;
                 else
                     gEndTestNumber = gStartTestNumber + (int32_t)number;

From 31fafb0ff5f4164fd857758d369cfde82304be17 Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Fri, 29 Jan 2021 14:14:45 +0000
Subject: [PATCH 025/158] Install generate_spirv_offline.py script (#1109)

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
---
 test_conformance/CMakeLists.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test_conformance/CMakeLists.txt b/test_conformance/CMakeLists.txt
index b9b87c1d35..87d68597f0 100644
--- a/test_conformance/CMakeLists.txt
+++ b/test_conformance/CMakeLists.txt
@@ -58,7 +58,10 @@ add_subdirectory( spir )
 
 file(GLOB CSV_FILES "opencl_conformance_tests_*.csv")
 
-set(PY_FILES run_conformance.py)
+set(PY_FILES
+    generate_spirv_offline.py
+    run_conformance.py
+)
 
 # Copy .csv files
 foreach(FILE ${CSV_FILES})

From c587b45a2b99c9270d6562b056bfa6972a47d754 Mon Sep 17 00:00:00 2001
From: John Kesapides <46718829+JohnKesapidesARM@users.noreply.github.com>
Date: Fri, 29 Jan 2021 14:15:16 +0000
Subject: [PATCH 026/158] Minor fixes for CL_ARGB channel order. (#1128)

Signed-off-by: John Kesapides <john.kesapides@arm.com>
Change-Id: I4f6bbce14535f6156365a5a46c4739d6a7257ab2
---
 test_common/harness/imageHelpers.cpp | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/test_common/harness/imageHelpers.cpp b/test_common/harness/imageHelpers.cpp
index db9cf3f6ee..b9cbfe7a1c 100644
--- a/test_common/harness/imageHelpers.cpp
+++ b/test_common/harness/imageHelpers.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2017,2021 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -550,6 +550,7 @@ int has_alpha(const cl_image_format *format)
         case CL_RGBA: return 1;
         case CL_BGRA: return 1;
         case CL_ARGB: return 1;
+        case CL_ABGR: return 1;
         case CL_INTENSITY: return 1;
         case CL_LUMINANCE: return 0;
 #ifdef CL_BGR1_APPLE
@@ -1416,6 +1417,12 @@ void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x,
             outData[2] = tempData[3];
             outData[3] = tempData[0];
             break;
+        case CL_ABGR:
+            outData[0] = tempData[3];
+            outData[1] = tempData[2];
+            outData[2] = tempData[1];
+            outData[3] = tempData[0];
+            break;
         case CL_BGRA:
         case CL_sBGRA:
             outData[0] = tempData[2];
@@ -2398,6 +2405,14 @@ void swizzle_vector_for_image(T *srcVector, const cl_image_format *imageFormat)
             srcVector[1] = srcVector[0];
             srcVector[0] = temp;
             break;
+        case CL_ABGR:
+            temp = srcVector[3];
+            srcVector[3] = srcVector[0];
+            srcVector[0] = temp;
+            temp = srcVector[2];
+            srcVector[2] = srcVector[1];
+            srcVector[1] = temp;
+            break;
         case CL_BGRA:
         case CL_sBGRA:
             temp = srcVector[0];

From 5930853a9b9e69c675adcf7d65605adb9cf25222 Mon Sep 17 00:00:00 2001
From: John Kesapides <46718829+JohnKesapidesARM@users.noreply.github.com>
Date: Fri, 29 Jan 2021 14:16:55 +0000
Subject: [PATCH 027/158] Minor fixes for clCopyImage. (#1130)

* Minor fixes for clCopyImage.

Signed-off-by: John Kesapides <john.kesapides@arm.com>
Change-Id: I63c47570e45580e5e29716a46929cb1127711c6b

* Convert comment to CTS style.
Return error code instead of -1 in clFinish.
---
 .../images/clCopyImage/test_copy_generic.cpp  | 79 +++++++++----------
 1 file changed, 38 insertions(+), 41 deletions(-)

diff --git a/test_conformance/images/clCopyImage/test_copy_generic.cpp b/test_conformance/images/clCopyImage/test_copy_generic.cpp
index d56ae7706c..577fa47b32 100644
--- a/test_conformance/images/clCopyImage/test_copy_generic.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_generic.cpp
@@ -105,23 +105,41 @@ cl_mem create_image( cl_context context, cl_command_queue queue, BufferOwningPtr
 
     if ( *error != CL_SUCCESS )
     {
+        long long unsigned imageSize = get_image_size_mb(imageInfo);
         switch (imageInfo->type)
         {
             case CL_MEM_OBJECT_IMAGE1D:
-                log_error( "ERROR: Unable to create 1D image of size %d (%s)", (int)imageInfo->width, IGetErrorString( *error ) );
+                log_error("ERROR: Unable to create 1D image of size %d (%llu "
+                          "MB):(%s)",
+                          (int)imageInfo->width, imageSize,
+                          IGetErrorString(*error));
                 break;
             case CL_MEM_OBJECT_IMAGE2D:
-                log_error( "ERROR: Unable to create 2D image of size %d x %d (%s)", (int)imageInfo->width, (int)imageInfo->height, IGetErrorString( *error ) );
+                log_error("ERROR: Unable to create 2D image of size %d x %d "
+                          "(%llu MB):(%s)",
+                          (int)imageInfo->width, (int)imageInfo->height,
+                          imageSize, IGetErrorString(*error));
                 break;
             case CL_MEM_OBJECT_IMAGE3D:
-                log_error( "ERROR: Unable to create 3D image of size %d x %d x %d (%s)", (int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->depth, IGetErrorString( *error ) );
+                log_error("ERROR: Unable to create 3D image of size %d x %d x "
+                          "%d (%llu MB):(%s)",
+                          (int)imageInfo->width, (int)imageInfo->height,
+                          (int)imageInfo->depth, imageSize,
+                          IGetErrorString(*error));
                 break;
             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
-                log_error( "ERROR: Unable to create 1D image array of size %d x %d (%s)", (int)imageInfo->width, (int)imageInfo->arraySize, IGetErrorString( *error ) );
+                log_error("ERROR: Unable to create 1D image array of size %d x "
+                          "%d (%llu MB):(%s)",
+                          (int)imageInfo->width, (int)imageInfo->arraySize,
+                          imageSize, IGetErrorString(*error));
                 break;
                 break;
             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
-                log_error( "ERROR: Unable to create 2D image array of size %d x %d x %d (%s)", (int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->arraySize, IGetErrorString( *error ) );
+                log_error("ERROR: Unable to create 2D image array of size %d x "
+                          "%d x %d (%llu MB):(%s)",
+                          (int)imageInfo->width, (int)imageInfo->height,
+                          (int)imageInfo->arraySize, imageSize,
+                          IGetErrorString(*error));
                 break;
         }
         log_error("ERROR: and %llu mip levels\n", (unsigned long long) imageInfo->num_mip_levels);
@@ -291,24 +309,7 @@ int test_copy_image_generic( cl_context context, cl_command_queue queue, image_d
     }
     else
     {
-        switch (srcImageInfo->type)
-        {
-            case CL_MEM_OBJECT_IMAGE1D:
-                srcBytes = srcImageInfo->rowPitch;
-                break;
-            case CL_MEM_OBJECT_IMAGE2D:
-                srcBytes = srcImageInfo->height * srcImageInfo->rowPitch;
-                break;
-            case CL_MEM_OBJECT_IMAGE3D:
-                srcBytes = srcImageInfo->depth * srcImageInfo->slicePitch;
-                break;
-            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
-                srcBytes = srcImageInfo->arraySize * srcImageInfo->slicePitch;
-                break;
-            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
-                srcBytes = srcImageInfo->arraySize * srcImageInfo->slicePitch;
-                break;
-        }
+        srcBytes = get_image_size(srcImageInfo);
     }
 
     if (srcBytes > srcData.getSize())
@@ -344,24 +345,7 @@ int test_copy_image_generic( cl_context context, cl_command_queue queue, image_d
     }
     else
     {
-        switch (dstImageInfo->type)
-        {
-            case CL_MEM_OBJECT_IMAGE1D:
-                destImageSize = dstImageInfo->rowPitch;
-                break;
-            case CL_MEM_OBJECT_IMAGE2D:
-                destImageSize = dstImageInfo->height * dstImageInfo->rowPitch;
-                break;
-            case CL_MEM_OBJECT_IMAGE3D:
-                destImageSize = dstImageInfo->depth * dstImageInfo->slicePitch;
-                break;
-            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
-                destImageSize = dstImageInfo->arraySize * dstImageInfo->slicePitch;
-                break;
-            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
-                destImageSize = dstImageInfo->arraySize * dstImageInfo->slicePitch;
-                break;
-        }
+        destImageSize = get_image_size(dstImageInfo);
     }
 
     if (destImageSize > dstData.getSize())
@@ -373,7 +357,11 @@ int test_copy_image_generic( cl_context context, cl_command_queue queue, image_d
             log_error( "ERROR: Unable to malloc %lu bytes for dstData\n", destImageSize );
             return -1;
         }
+    }
 
+    if (destImageSize > dstHost.getSize())
+    {
+        dstHost.reset(NULL);
         dstHost.reset(malloc(destImageSize),NULL,0,destImageSize);
         if (dstHost == NULL) {
             dstData.reset(NULL);
@@ -632,5 +620,14 @@ int test_copy_image_generic( cl_context context, cl_command_queue queue, image_d
         return error;
     }
 
+    // Ensure the unmap call completes.
+    error = clFinish(queue);
+    if (error != CL_SUCCESS)
+    {
+        log_error("ERROR: clFinish() failed to return CL_SUCCESS: %s\n",
+                  IGetErrorString(error));
+        return error;
+    }
+
     return 0;
 }

From bfca863cb8d9b95b50e13ff1fd1bc56b788745f8 Mon Sep 17 00:00:00 2001
From: Nikhil Joshi <nikhilj@nvidia.com>
Date: Fri, 29 Jan 2021 19:48:13 +0530
Subject: [PATCH 028/158] Handle NULL hostptr in conformance image tests
 (#1087)

* Fix enqueue_flags test to use correct barrier type.

Currently, enqueue_flags test uses CLK_LOCAL_MEM_FENCE.
Use CLK_GLOBAL_MEM_FENCE instead as all threads across work-groups
need to wait here.

* Add check for support for Read-Write images

Read-Write images have required OpenCL 2.x.
Read-Write image tests are already being skipped
for 1.x devices.
With OpenCL 3.0, read-write images being optional,
the tests should be run or skipped
depending on the implementation support.

Add a check to decide if Read-Write images are
supported or required to be supported depending
on OpenCL version and decide if the tests should
be run or skipped.

Fixes issue #894

* Fix formatting in case of Read-Write image checks.

Fix formatting in case of Read-write image checks.
Also, combine two ifs into one in case of
kernel_read_write tests

* Fix some more formatting for RW-image checks

Remove unnecessary spaces at various places.
Also, fix lengthy lines.

* Handle NULL hostptr in conformance image tests

As per the spec, clCreateBuffer and clCreateImage return
CL_INVALID_HOST_PTR if host_ptr is NULL
and CL_MEM_USE_HOST_PTR or CL_MEM_COPY_HOST_PTR are set in flags
or if host_ptr is not NULL
but CL_MEM_USE_HOST_PTR or CL_MEM_COPY_HOST_PTR are not set in flags.

Host pointer should be NULL when USE/COPY_HOST_PTR is not set in flags.

* Revert "Handle NULL hostptr in conformance image tests"

This reverts commit 49fa049f9b75d77d325bc08e74b7e3e2e32147a6.

* Move cl_mem_flag and host_ptr check to ImageHelper

Add a check to see if cl_mem_flag has
USE_HOST_PTR or COPY_HOST_PTR.
Override host_ptr with NULL if none of them
are specified.
---
 test_common/harness/clImageHelper.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/test_common/harness/clImageHelper.h b/test_common/harness/clImageHelper.h
index 45395fb078..3019ff341b 100644
--- a/test_common/harness/clImageHelper.h
+++ b/test_common/harness/clImageHelper.h
@@ -37,6 +37,11 @@ static inline cl_mem create_image_2d(cl_context context, cl_mem_flags flags,
 {
     cl_mem mImage = NULL;
 
+    if (!(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)))
+    {
+        host_ptr = NULL;
+    }
+
 #ifdef CL_VERSION_1_2
     cl_image_desc image_desc_dest;
     image_desc_dest.image_type = CL_MEM_OBJECT_IMAGE2D;
@@ -119,6 +124,11 @@ static inline cl_mem create_image_3d(cl_context context, cl_mem_flags flags,
 {
     cl_mem mImage;
 
+    if (!(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)))
+    {
+        host_ptr = NULL;
+    }
+
 #ifdef CL_VERSION_1_2
     cl_image_desc image_desc;
     image_desc.image_type = CL_MEM_OBJECT_IMAGE3D;
@@ -166,6 +176,11 @@ create_image_2d_array(cl_context context, cl_mem_flags flags,
 {
     cl_mem mImage;
 
+    if (!(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)))
+    {
+        host_ptr = NULL;
+    }
+
     cl_image_desc image_desc;
     image_desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;
     image_desc.image_width = image_width;
@@ -196,6 +211,11 @@ static inline cl_mem create_image_1d_array(
 {
     cl_mem mImage;
 
+    if (!(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)))
+    {
+        host_ptr = NULL;
+    }
+
     cl_image_desc image_desc;
     image_desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
     image_desc.image_width = image_width;
@@ -227,6 +247,11 @@ static inline cl_mem create_image_1d(cl_context context, cl_mem_flags flags,
 {
     cl_mem mImage;
 
+    if (!(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)))
+    {
+        host_ptr = NULL;
+    }
+
     cl_image_desc image_desc;
     image_desc.image_type =
         buffer ? CL_MEM_OBJECT_IMAGE1D_BUFFER : CL_MEM_OBJECT_IMAGE1D;

From eb2287cc1b5ff582c524ff39b10b1f96b0689f82 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Fri, 29 Jan 2021 14:19:15 +0000
Subject: [PATCH 029/158] Remove unnecessary scope (#1126)

Reformat code using clang-format.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/binary_two_results_i.cpp | 250 +++++++++---------
 1 file changed, 118 insertions(+), 132 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
index 1130e93cb3..cfb42c82fa 100644
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i.cpp
@@ -504,163 +504,150 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                     && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j])
                     continue;
 
-                // if( t[j] != q[j] || t2[j] != q2[j] )
+                float test = ((float *)q)[j];
+                int correct2 = INT_MIN;
+                double correct = f->func.f_ffpI(s[j], s2[j], &correct2);
+                float err = Ulp_Error(test, correct);
+                int64_t iErr;
+
+                // in case of remquo, we only care about the sign and last
+                // seven bits of integer as per the spec.
+                if (testingRemquo)
+                    iErr = (long long)(q2[j] & 0x0000007f)
+                        - (long long)(correct2 & 0x0000007f);
+                else
+                    iErr = (long long)q2[j] - (long long)correct2;
+
+                // For remquo, if y = 0, x is infinite, or either is NaN
+                // then the standard either neglects to say what is returned
+                // in iptr or leaves it undefined or implementation defined.
+                int iptrUndefined = fabs(((float *)gIn)[j]) == INFINITY
+                    || ((float *)gIn2)[j] == 0.0f || isnan(((float *)gIn2)[j])
+                    || isnan(((float *)gIn)[j]);
+                if (iptrUndefined) iErr = 0;
+
+                int fail = !(fabsf(err) <= float_ulps && iErr == 0);
+                if (ftz && fail)
                 {
-                    float test = ((float *)q)[j];
-                    int correct2 = INT_MIN;
-                    double correct = f->func.f_ffpI(s[j], s2[j], &correct2);
-                    float err = Ulp_Error(test, correct);
-                    int64_t iErr;
-
-                    // in case of remquo, we only care about the sign and last
-                    // seven bits of integer as per the spec.
-                    if (testingRemquo)
-                        iErr = (long long)(q2[j] & 0x0000007f)
-                            - (long long)(correct2 & 0x0000007f);
-                    else
-                        iErr = (long long)q2[j] - (long long)correct2;
-
-                    // For remquo, if y = 0, x is infinite, or either is NaN
-                    // then the standard either neglects to say what is returned
-                    // in iptr or leaves it undefined or implementation defined.
-                    int iptrUndefined = fabs(((float *)gIn)[j]) == INFINITY
-                        || ((float *)gIn2)[j] == 0.0f
-                        || isnan(((float *)gIn2)[j])
-                        || isnan(((float *)gIn)[j]);
-                    if (iptrUndefined) iErr = 0;
-
-                    int fail = !(fabsf(err) <= float_ulps && iErr == 0);
-                    if (ftz && fail)
+                    // retry per section 6.5.3.2
+                    if (IsFloatResultSubnormal(correct, float_ulps))
                     {
-                        // retry per section 6.5.3.2
-                        if (IsFloatResultSubnormal(correct, float_ulps))
+                        fail = fail && !(test == 0.0f && iErr == 0);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsFloatSubnormal(s[j]))
+                    {
+                        int correct3i, correct4i;
+                        double correct3 =
+                            f->func.f_ffpI(0.0, s2[j], &correct3i);
+                        double correct4 =
+                            f->func.f_ffpI(-0.0, s2[j], &correct4i);
+                        float err2 = Ulp_Error(test, correct3);
+                        float err3 = Ulp_Error(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= float_ulps
+                                      && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4
+                        if (IsFloatResultSubnormal(correct2, float_ulps)
+                            || IsFloatResultSubnormal(correct3, float_ulps))
                         {
-                            fail = fail && !(test == 0.0f && iErr == 0);
+                            fail = fail
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
                             if (!fail) err = 0.0f;
                         }
 
-                        // retry per section 6.5.3.3
-                        if (IsFloatSubnormal(s[j]))
+                        // try with both args as zero
+                        if (IsFloatSubnormal(s2[j]))
                         {
-                            int correct3i, correct4i;
-                            double correct3 =
-                                f->func.f_ffpI(0.0, s2[j], &correct3i);
-                            double correct4 =
-                                f->func.f_ffpI(-0.0, s2[j], &correct4i);
-                            float err2 = Ulp_Error(test, correct3);
-                            float err3 = Ulp_Error(test, correct4);
-                            int64_t iErr3 =
-                                (long long)q2[j] - (long long)correct3i;
-                            int64_t iErr4 =
-                                (long long)q2[j] - (long long)correct4i;
+                            int correct7i, correct8i;
+                            correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i);
+                            correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i);
+                            double correct7 =
+                                f->func.f_ffpI(0.0, -0.0, &correct7i);
+                            double correct8 =
+                                f->func.f_ffpI(-0.0, -0.0, &correct8i);
+                            err2 = Ulp_Error(test, correct3);
+                            err3 = Ulp_Error(test, correct4);
+                            float err4 = Ulp_Error(test, correct7);
+                            float err5 = Ulp_Error(test, correct8);
+                            iErr3 = (long long)q2[j] - (long long)correct3i;
+                            iErr4 = (long long)q2[j] - (long long)correct4i;
+                            int64_t iErr7 =
+                                (long long)q2[j] - (long long)correct7i;
+                            int64_t iErr8 =
+                                (long long)q2[j] - (long long)correct8i;
                             fail = fail
                                 && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
                                     && (!(fabsf(err3) <= float_ulps
-                                          && iErr4 == 0)));
+                                          && iErr4 == 0))
+                                    && (!(fabsf(err4) <= float_ulps
+                                          && iErr7 == 0))
+                                    && (!(fabsf(err5) <= float_ulps
+                                          && iErr8 == 0)));
                             if (fabsf(err2) < fabsf(err)) err = err2;
                             if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
                             if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
                             if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+                            if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
+                            if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
 
                             // retry per section 6.5.3.4
-                            if (IsFloatResultSubnormal(correct2, float_ulps)
-                                || IsFloatResultSubnormal(correct3, float_ulps))
+                            if (IsFloatResultSubnormal(correct3, float_ulps)
+                                || IsFloatResultSubnormal(correct4, float_ulps)
+                                || IsFloatResultSubnormal(correct7, float_ulps)
+                                || IsFloatResultSubnormal(correct8, float_ulps))
                             {
                                 fail = fail
                                     && !(test == 0.0f
-                                         && (iErr3 == 0 || iErr4 == 0));
+                                         && (iErr3 == 0 || iErr4 == 0
+                                             || iErr7 == 0 || iErr8 == 0));
                                 if (!fail) err = 0.0f;
                             }
-
-                            // try with both args as zero
-                            if (IsFloatSubnormal(s2[j]))
-                            {
-                                int correct7i, correct8i;
-                                correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i);
-                                correct4 =
-                                    f->func.f_ffpI(-0.0, 0.0, &correct4i);
-                                double correct7 =
-                                    f->func.f_ffpI(0.0, -0.0, &correct7i);
-                                double correct8 =
-                                    f->func.f_ffpI(-0.0, -0.0, &correct8i);
-                                err2 = Ulp_Error(test, correct3);
-                                err3 = Ulp_Error(test, correct4);
-                                float err4 = Ulp_Error(test, correct7);
-                                float err5 = Ulp_Error(test, correct8);
-                                iErr3 = (long long)q2[j] - (long long)correct3i;
-                                iErr4 = (long long)q2[j] - (long long)correct4i;
-                                int64_t iErr7 =
-                                    (long long)q2[j] - (long long)correct7i;
-                                int64_t iErr8 =
-                                    (long long)q2[j] - (long long)correct8i;
-                                fail = fail
-                                    && ((!(fabsf(err2) <= float_ulps
-                                           && iErr3 == 0))
-                                        && (!(fabsf(err3) <= float_ulps
-                                              && iErr4 == 0))
-                                        && (!(fabsf(err4) <= float_ulps
-                                              && iErr7 == 0))
-                                        && (!(fabsf(err5) <= float_ulps
-                                              && iErr8 == 0)));
-                                if (fabsf(err2) < fabsf(err)) err = err2;
-                                if (fabsf(err3) < fabsf(err)) err = err3;
-                                if (fabsf(err4) < fabsf(err)) err = err4;
-                                if (fabsf(err5) < fabsf(err)) err = err5;
-                                if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                                if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-                                if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
-                                if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
-
-                                // retry per section 6.5.3.4
-                                if (IsFloatResultSubnormal(correct3, float_ulps)
-                                    || IsFloatResultSubnormal(correct4,
-                                                              float_ulps)
-                                    || IsFloatResultSubnormal(correct7,
-                                                              float_ulps)
-                                    || IsFloatResultSubnormal(correct8,
-                                                              float_ulps))
-                                {
-                                    fail = fail
-                                        && !(test == 0.0f
-                                             && (iErr3 == 0 || iErr4 == 0
-                                                 || iErr7 == 0 || iErr8 == 0));
-                                    if (!fail) err = 0.0f;
-                                }
-                            }
                         }
-                        else if (IsFloatSubnormal(s2[j]))
+                    }
+                    else if (IsFloatSubnormal(s2[j]))
+                    {
+                        int correct3i, correct4i;
+                        double correct3 = f->func.f_ffpI(s[j], 0.0, &correct3i);
+                        double correct4 =
+                            f->func.f_ffpI(s[j], -0.0, &correct4i);
+                        float err2 = Ulp_Error(test, correct3);
+                        float err3 = Ulp_Error(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= float_ulps
+                                      && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4
+                        if (IsFloatResultSubnormal(correct2, float_ulps)
+                            || IsFloatResultSubnormal(correct3, float_ulps))
                         {
-                            int correct3i, correct4i;
-                            double correct3 =
-                                f->func.f_ffpI(s[j], 0.0, &correct3i);
-                            double correct4 =
-                                f->func.f_ffpI(s[j], -0.0, &correct4i);
-                            float err2 = Ulp_Error(test, correct3);
-                            float err3 = Ulp_Error(test, correct4);
-                            int64_t iErr3 =
-                                (long long)q2[j] - (long long)correct3i;
-                            int64_t iErr4 =
-                                (long long)q2[j] - (long long)correct4i;
                             fail = fail
-                                && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
-                                    && (!(fabsf(err3) <= float_ulps
-                                          && iErr4 == 0)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-
-                            // retry per section 6.5.3.4
-                            if (IsFloatResultSubnormal(correct2, float_ulps)
-                                || IsFloatResultSubnormal(correct3, float_ulps))
-                            {
-                                fail = fail
-                                    && !(test == 0.0f
-                                         && (iErr3 == 0 || iErr4 == 0));
-                                if (!fail) err = 0.0f;
-                            }
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
                         }
                     }
+                }
                     if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
@@ -687,7 +674,6 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                         error = -1;
                         goto exit;
                     }
-                }
             }
         }
 

From 545e4d03098400bff3da32ae7c412911cdf7d606 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Fri, 29 Jan 2021 14:20:39 +0000
Subject: [PATCH 030/158] Make functions local to translation unit (#1125)

Remove unnecessary declarations.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/binary.cpp  | 19 +++++--------------
 .../math_brute_force/binaryOperator.cpp       | 13 -------------
 .../math_brute_force/binary_i.cpp             |  9 ---------
 .../math_brute_force/binary_two_results_i.cpp |  8 --------
 test_conformance/math_brute_force/i_unary.cpp |  8 --------
 .../math_brute_force/macro_binary.cpp         |  9 ---------
 .../math_brute_force/macro_unary.cpp          |  9 ---------
 test_conformance/math_brute_force/mad.cpp     |  8 --------
 test_conformance/math_brute_force/ternary.cpp |  8 --------
 test_conformance/math_brute_force/unary.cpp   | 10 ----------
 .../math_brute_force/unary_two_results.cpp    |  8 --------
 .../math_brute_force/unary_two_results_i.cpp  | 12 +-----------
 test_conformance/math_brute_force/unary_u.cpp | 10 ----------
 13 files changed, 6 insertions(+), 125 deletions(-)

diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary.cpp
index db961c8d77..1784c725d9 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary.cpp
@@ -24,10 +24,6 @@ int TestFunc_Float_Float_Float_nextafter(const Func *f, MTdata,
                                          bool relaxedMode);
 int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata,
                                             bool relaxedMode);
-int TestFunc_Float_Float_Float_common(const Func *f, MTdata, int isNextafter,
-                                      bool relaxedMode);
-int TestFunc_Double_Double_Double_common(const Func *f, MTdata, int isNextafter,
-                                         bool relaxedMode);
 
 const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
 const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
@@ -40,8 +36,6 @@ extern const vtbl _binary_nextafter = {
     TestFunc_Double_Double_Double_nextafter
 };
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode);
 
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
@@ -330,8 +324,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p);
 static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                   void *p)
 {
@@ -341,8 +333,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p);
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
@@ -394,8 +384,8 @@ typedef struct TestInfo
 
 static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
-int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter,
-                                      bool relaxedMode)
+static int TestFunc_Float_Float_Float_common(const Func *f, MTdata d,
+                                             int isNextafter, bool relaxedMode)
 {
     TestInfo test_info;
     cl_int error;
@@ -1336,8 +1326,9 @@ static size_t specialValuesDoubleCount =
 
 static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
 
-int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
-                                         int isNextafter, bool relaxedMode)
+static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
+                                                int isNextafter,
+                                                bool relaxedMode)
 {
     TestInfo test_info;
     cl_int error;
diff --git a/test_conformance/math_brute_force/binaryOperator.cpp b/test_conformance/math_brute_force/binaryOperator.cpp
index f6ba838a6d..bd1a3143c0 100644
--- a/test_conformance/math_brute_force/binaryOperator.cpp
+++ b/test_conformance/math_brute_force/binaryOperator.cpp
@@ -27,13 +27,6 @@ extern const vtbl _binary_operator = { "binaryOperator",
                                        TestFunc_Float_Float_Float_Operator,
                                        TestFunc_Double_Double_Double_Operator };
 
-static int BuildKernel(const char *name, const char *operator_symbol,
-                       int vectorSize, cl_uint kernel_count, cl_kernel *k,
-                       cl_program *p, bool relaxedMode);
-static int BuildKernelDouble(const char *name, const char *operator_symbol,
-                             int vectorSize, cl_uint kernel_count, cl_kernel *k,
-                             cl_program *p, bool relaxedMode);
-
 static int BuildKernel(const char *name, const char *operator_symbol,
                        int vectorSize, cl_uint kernel_count, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
@@ -223,8 +216,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p);
 static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                   void *p)
 {
@@ -234,8 +225,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p);
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
@@ -283,8 +272,6 @@ typedef struct TestInfo
     // no special fields
 } TestInfo;
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i.cpp
index dc6feb8c82..a8535281b5 100644
--- a/test_conformance/math_brute_force/binary_i.cpp
+++ b/test_conformance/math_brute_force/binary_i.cpp
@@ -25,11 +25,6 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata, bool relaxedMode);
 extern const vtbl _binary_i = { "binary_i", TestFunc_Float_Float_Int,
                                 TestFunc_Double_Double_Int };
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode);
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode);
 
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
@@ -214,8 +209,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p);
 static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                   void *p)
 {
@@ -225,8 +218,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p);
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
index cfb42c82fa..a0aa9d2503 100644
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i.cpp
@@ -26,10 +26,6 @@ extern const vtbl _binary_two_results_i = { "binary_two_results_i",
                                             TestFunc_FloatI_Float_Float,
                                             TestFunc_DoubleI_Double_Double };
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode);
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode);
 
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
@@ -224,8 +220,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p);
 static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                   void *p)
 {
@@ -235,8 +229,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p);
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary.cpp
index 5b8b42439a..7f2f79a3f8 100644
--- a/test_conformance/math_brute_force/i_unary.cpp
+++ b/test_conformance/math_brute_force/i_unary.cpp
@@ -25,10 +25,6 @@ extern const vtbl _i_unary = { "i_unary", TestFunc_Int_Float,
                                TestFunc_Int_Double };
 
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode);
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode);
 
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
@@ -198,8 +194,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p);
 static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                   void *p)
 {
@@ -209,8 +203,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p);
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary.cpp
index 52c4e96cd7..b0b8214956 100644
--- a/test_conformance/math_brute_force/macro_binary.cpp
+++ b/test_conformance/math_brute_force/macro_binary.cpp
@@ -24,11 +24,6 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata, bool relaxedMode);
 extern const vtbl _macro_binary = { "macro_binary", TestMacro_Int_Float_Float,
                                     TestMacro_Int_Double_Double };
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p);
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode);
 
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
@@ -214,8 +209,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p);
 static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                   void *p)
 {
@@ -225,8 +218,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p);
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary.cpp
index 26a186f640..bf08a17081 100644
--- a/test_conformance/math_brute_force/macro_unary.cpp
+++ b/test_conformance/math_brute_force/macro_unary.cpp
@@ -24,11 +24,6 @@ int TestMacro_Int_Double(const Func *f, MTdata, bool relaxedMode);
 extern const vtbl _macro_unary = { "macro_unary", TestMacro_Int_Float,
                                    TestMacro_Int_Double };
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode);
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode);
 
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
@@ -202,8 +197,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p);
 static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                   void *p)
 {
@@ -213,8 +206,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p);
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad.cpp
index a510e5b118..fb144e4b8d 100644
--- a/test_conformance/math_brute_force/mad.cpp
+++ b/test_conformance/math_brute_force/mad.cpp
@@ -23,10 +23,6 @@ int TestFunc_mad_Double(const Func *f, MTdata, bool relaxedMode);
 
 extern const vtbl _mad_tbl = { "ternary", TestFunc_mad, TestFunc_mad_Double };
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode);
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode);
 
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
@@ -215,8 +211,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p);
 static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                   void *p)
 {
@@ -226,8 +220,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p);
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary.cpp
index 015dbc25db..b3eea0f4da 100644
--- a/test_conformance/math_brute_force/ternary.cpp
+++ b/test_conformance/math_brute_force/ternary.cpp
@@ -28,10 +28,6 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata,
 extern const vtbl _ternary = { "ternary", TestFunc_Float_Float_Float_Float,
                                TestFunc_Double_Double_Double_Double };
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode);
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode);
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
@@ -221,8 +217,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p);
 static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                   void *p)
 {
@@ -232,8 +226,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p);
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary.cpp
index 61a8546b40..ae3f54e808 100644
--- a/test_conformance/math_brute_force/unary.cpp
+++ b/test_conformance/math_brute_force/unary.cpp
@@ -28,12 +28,6 @@ int TestFunc_Double_Double(const Func *f, MTdata, bool relaxedMode);
 extern const vtbl _unary = { "unary", TestFunc_Float_Float,
                              TestFunc_Double_Double };
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode);
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode);
-
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
@@ -206,8 +200,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p);
 static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                   void *p)
 {
@@ -217,8 +209,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p);
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results.cpp
index 177239adb1..77d40b0d46 100644
--- a/test_conformance/math_brute_force/unary_two_results.cpp
+++ b/test_conformance/math_brute_force/unary_two_results.cpp
@@ -25,10 +25,6 @@ extern const vtbl _unary_two_results = { "unary_two_results",
                                          TestFunc_Float2_Float,
                                          TestFunc_Double2_Double };
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode);
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode);
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
@@ -208,8 +204,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p);
 static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                   void *p)
 {
@@ -219,8 +213,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p);
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i.cpp
index b79f91a2fd..f3c73434b8 100644
--- a/test_conformance/math_brute_force/unary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i.cpp
@@ -26,11 +26,6 @@ extern const vtbl _unary_two_results_i = { "unary_two_results_i",
                                            TestFunc_FloatI_Float,
                                            TestFunc_DoubleI_Double };
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode);
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode);
-
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
@@ -208,8 +203,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p);
 static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                   void *p)
 {
@@ -219,8 +212,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p);
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
@@ -230,8 +221,7 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->programs + i, info->relaxedMode);
 }
 
-cl_ulong abs_cl_long(cl_long i);
-cl_ulong abs_cl_long(cl_long i)
+static cl_ulong abs_cl_long(cl_long i)
 {
     cl_long mask = i >> 63;
     return (i ^ mask) - mask;
diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u.cpp
index 43e997ef8c..53f5db384e 100644
--- a/test_conformance/math_brute_force/unary_u.cpp
+++ b/test_conformance/math_brute_force/unary_u.cpp
@@ -24,12 +24,6 @@ int TestFunc_Double_ULong(const Func *f, MTdata, bool relaxedMode);
 extern const vtbl _unary_u = { "unary_u", TestFunc_Float_UInt,
                                TestFunc_Double_ULong };
 
-
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode);
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode);
-
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
@@ -198,8 +192,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p);
 static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                   void *p)
 {
@@ -209,8 +201,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p);
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {

From 4abea6f761f6320e420b815352b2a7d7bc850c0a Mon Sep 17 00:00:00 2001
From: Stephen <zsrkmyn@gmail.com>
Date: Mon, 1 Feb 2021 17:23:10 +0800
Subject: [PATCH 031/158] Fix literal integer types for atomics tests (#1059)

* Fix literal integer types for atomics tests

* [NFC] Format previous changes
---
 test_conformance/atomics/test_atomics.cpp | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/test_conformance/atomics/test_atomics.cpp b/test_conformance/atomics/test_atomics.cpp
index 5f4c0943b9..34b34ed38a 100644
--- a/test_conformance/atomics/test_atomics.cpp
+++ b/test_conformance/atomics/test_atomics.cpp
@@ -1004,8 +1004,7 @@ cl_long test_atomic_and_result_long( size_t size, cl_long *startRefValues, size_
     // Last item doesn't get and'ed on every bit, so we have to mask away
     size_t numBits = (size_t)size - whichResult * 64;
     cl_long bits = (cl_long)0xffffffffffffffffLL;
-    for( size_t i = 0; i < numBits; i++ )
-        bits &= ~( 1 << i );
+    for (size_t i = 0; i < numBits; i++) bits &= ~(1LL << i);
 
     return bits;
 }
@@ -1086,18 +1085,16 @@ int test_atomic_or(cl_device_id deviceID, cl_context context, cl_command_queue q
 #pragma mark ---- xor
 
 const char atom_xor_core[] =
-"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
-"    int  bitIndex = tid & ( numBits - 1 );\n"
-"\n"
-"    oldValues[tid] = atom_xor( &destMemory[0], 1 << bitIndex );\n"
-;
+    "    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
+    "    int  bitIndex = tid & ( numBits - 1 );\n"
+    "\n"
+    "    oldValues[tid] = atom_xor( &destMemory[0], 1L << bitIndex );\n";
 
 const char atomic_xor_core[] =
-"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
-"    int  bitIndex = tid & ( numBits - 1 );\n"
-"\n"
-"    oldValues[tid] = atomic_xor( &destMemory[0], 1 << bitIndex );\n"
-;
+    "    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
+    "    int  bitIndex = tid & ( numBits - 1 );\n"
+    "\n"
+    "    oldValues[tid] = atomic_xor( &destMemory[0], 1L << bitIndex );\n";
 
 cl_int test_atomic_xor_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
 {

From ca673af48897b179956a86db108af1aa881b220a Mon Sep 17 00:00:00 2001
From: ellnor01 <51320439+ellnor01@users.noreply.github.com>
Date: Tue, 2 Feb 2021 08:56:00 +0000
Subject: [PATCH 032/158] First steps in tidying image/kernel_read_write tests
 (#1121)

* Move common global variable and functions to header

InitFloatCoords for 3D read images has also been renamed
so it can later be used as a common function

Contributes #616

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>

* Set-up for using 3D functions as a base

test_read_image_3D has been moved to test_common.cpp (and renamed
test_read_image) along with the corresponding
determine_validation_error_offset and InitFloatCoords functions.

Only function names and the formatting have been changed.

Contributes #616

Signed-off-by: Ellen Norris-Thompson <ellen.norris-thompson@arm.com>
---
 .../images/kernel_read_write/test_common.cpp  | 1527 +++++++++++++++++
 .../images/kernel_read_write/test_common.h    |  215 +++
 .../kernel_read_write/test_iterations.cpp     |    7 +-
 .../images/kernel_read_write/test_read_1D.cpp |    8 +-
 .../kernel_read_write/test_read_1D_array.cpp  |   11 +-
 .../kernel_read_write/test_read_2D_array.cpp  |    6 +-
 .../images/kernel_read_write/test_read_3D.cpp | 1102 +-----------
 7 files changed, 1759 insertions(+), 1117 deletions(-)

diff --git a/test_conformance/images/kernel_read_write/test_common.cpp b/test_conformance/images/kernel_read_write/test_common.cpp
index 5182601b86..e76710b577 100644
--- a/test_conformance/images/kernel_read_write/test_common.cpp
+++ b/test_conformance/images/kernel_read_write/test_common.cpp
@@ -1,6 +1,22 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "test_common.h"
 
+
 cl_sampler create_sampler(cl_context context, image_sampler_data *sdata, bool test_mipmaps, cl_int *error) {
     cl_sampler sampler = nullptr;
     if (test_mipmaps) {
@@ -17,3 +33,1514 @@ cl_sampler create_sampler(cl_context context, image_sampler_data *sdata, bool te
     return sampler;
 }
 
+void InitFloatCoordsCommon(image_descriptor *imageInfo,
+                           image_sampler_data *imageSampler, float *xOffsets,
+                           float *yOffsets, float *zOffsets, float xfract,
+                           float yfract, float zfract, int normalized_coords,
+                           MTdata d, int lod)
+{ // Fills xOffsets/yOffsets/zOffsets with one float coordinate per texel of imageInfo.
+    size_t i = 0; // flat index into the three arrays; x varies fastest, then y, then z
+    if (gDisableOffsets)
+    { // offsets disabled: coordinate = integer texel index + the given fractional part
+        for (size_t z = 0; z < imageInfo->depth; z++)
+        {
+            for (size_t y = 0; y < imageInfo->height; y++)
+            {
+                for (size_t x = 0; x < imageInfo->width; x++, i++)
+                {
+                    xOffsets[i] = (float)(xfract + (double)x);
+                    yOffsets[i] = (float)(yfract + (double)y);
+                    zOffsets[i] = (float)(zfract + (double)z);
+                }
+            }
+        }
+    }
+    else
+    { // otherwise each texel index is also shifted by a random_in_range(-10, 10, d) value
+        for (size_t z = 0; z < imageInfo->depth; z++)
+        {
+            for (size_t y = 0; y < imageInfo->height; y++)
+            {
+                for (size_t x = 0; x < imageInfo->width; x++, i++)
+                {
+                    xOffsets[i] =
+                        (float)(xfract
+                                + (double)((int)x
+                                           + random_in_range(-10, 10, d)));
+                    yOffsets[i] =
+                        (float)(yfract
+                                + (double)((int)y
+                                           + random_in_range(-10, 10, d)));
+                    zOffsets[i] =
+                        (float)(zfract
+                                + (double)((int)z
+                                           + random_in_range(-10, 10, d)));
+                }
+            }
+        }
+    }
+
+    if (imageSampler->addressing_mode == CL_ADDRESS_NONE)
+    { // second pass: CLAMP every coordinate with bounds 0 and (dimension - 1)
+        i = 0;
+        for (size_t z = 0; z < imageInfo->depth; z++)
+        {
+            for (size_t y = 0; y < imageInfo->height; y++)
+            {
+                for (size_t x = 0; x < imageInfo->width; x++, i++)
+                {
+                    xOffsets[i] = (float)CLAMP((double)xOffsets[i], 0.0,
+                                               (double)imageInfo->width - 1.0);
+                    yOffsets[i] = (float)CLAMP((double)yOffsets[i], 0.0,
+                                               (double)imageInfo->height - 1.0);
+                    zOffsets[i] = (float)CLAMP((double)zOffsets[i], 0.0,
+                                               (double)imageInfo->depth - 1.0);
+                }
+            }
+        }
+    }
+
+    if (normalized_coords || gTestMipmaps)
+    { // final pass: rescale coordinates by dividing them by the image dimensions
+        i = 0;
+        if (lod == 0)
+        { // level 0: divide by the full width / height / depth
+            for (size_t z = 0; z < imageInfo->depth; z++)
+            {
+                for (size_t y = 0; y < imageInfo->height; y++)
+                {
+                    for (size_t x = 0; x < imageInfo->width; x++, i++)
+                    {
+                        xOffsets[i] = (float)((double)xOffsets[i]
+                                              / (double)imageInfo->width);
+                        yOffsets[i] = (float)((double)yOffsets[i]
+                                              / (double)imageInfo->height);
+                        zOffsets[i] = (float)((double)zOffsets[i]
+                                              / (double)imageInfo->depth);
+                    }
+                }
+            }
+        }
+        else if (gTestMipmaps)
+        { // NOTE(review): if lod != 0 and gTestMipmaps is unset, nothing is rescaled even with normalized_coords set - confirm intended
+            size_t width_lod, height_lod, depth_lod; // dimensions used for level lod: (dim >> lod), or 1 when that shift yields 0
+
+            width_lod =
+                (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
+            height_lod =
+                (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
+            depth_lod =
+                (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
+
+            for (size_t z = 0; z < depth_lod; z++)
+            { // only the first width_lod * height_lod * depth_lod entries are rescaled here
+                for (size_t y = 0; y < height_lod; y++)
+                {
+                    for (size_t x = 0; x < width_lod; x++, i++)
+                    {
+                        xOffsets[i] =
+                            (float)((double)xOffsets[i] / (double)width_lod);
+                        yOffsets[i] =
+                            (float)((double)yOffsets[i] / (double)height_lod);
+                        zOffsets[i] =
+                            (float)((double)zOffsets[i] / (double)depth_lod);
+                    }
+                }
+            }
+        }
+    }
+}
+
+int test_read_image(cl_context context, cl_command_queue queue,
+                    cl_kernel kernel, image_descriptor *imageInfo,
+                    image_sampler_data *imageSampler, bool useFloatCoords,
+                    ExplicitType outputType, MTdata d)
+{
+    int error;
+    size_t threads[3];
+    static int initHalf = 0;
+
+    cl_mem_flags image_read_write_flags = CL_MEM_READ_ONLY;
+
+    clMemWrapper xOffsets, yOffsets, zOffsets, results;
+    clSamplerWrapper actualSampler;
+    BufferOwningPtr<char> maxImageUseHostPtrBackingStore;
+
+    // Create offset data
+    BufferOwningPtr<cl_float> xOffsetValues(
+        malloc(sizeof(cl_float) * imageInfo->width * imageInfo->height
+               * imageInfo->depth));
+    BufferOwningPtr<cl_float> yOffsetValues(
+        malloc(sizeof(cl_float) * imageInfo->width * imageInfo->height
+               * imageInfo->depth));
+    BufferOwningPtr<cl_float> zOffsetValues(
+        malloc(sizeof(cl_float) * imageInfo->width * imageInfo->height
+               * imageInfo->depth));
+
+    if (imageInfo->format->image_channel_data_type == CL_HALF_FLOAT)
+        if (DetectFloatToHalfRoundingMode(queue)) return 1;
+
+    BufferOwningPtr<char> imageValues;
+    generate_random_image_data(imageInfo, imageValues, d);
+
+    // Construct testing sources
+    clProtectedImage protImage;
+    clMemWrapper unprotImage;
+    cl_mem image;
+
+    if (gtestTypesToRun & kReadTests)
+    {
+        image_read_write_flags = CL_MEM_READ_ONLY;
+    }
+    else
+    {
+        image_read_write_flags = CL_MEM_READ_WRITE;
+    }
+
+    if (gMemFlagsToUse == CL_MEM_USE_HOST_PTR)
+    {
+        // clProtectedImage uses USE_HOST_PTR, so just rely on that for the
+        // testing (via Ian). Do not use protected images for the max image
+        // size test, since it rounds the row size to a page size.
+        if (gTestMaxImages)
+        {
+            generate_random_image_data(imageInfo,
+                                       maxImageUseHostPtrBackingStore, d);
+            unprotImage = create_image_3d(
+                context, image_read_write_flags | CL_MEM_USE_HOST_PTR,
+                imageInfo->format, imageInfo->width, imageInfo->height,
+                imageInfo->depth, (gEnablePitch ? imageInfo->rowPitch : 0),
+                (gEnablePitch ? imageInfo->slicePitch : 0),
+                maxImageUseHostPtrBackingStore, &error);
+        }
+        else
+        {
+            error = protImage.Create(context, image_read_write_flags,
+                                     imageInfo->format, imageInfo->width,
+                                     imageInfo->height, imageInfo->depth);
+        }
+        if (error != CL_SUCCESS)
+        {
+            log_error("ERROR: Unable to create 3D image of size %d x %d x %d "
+                      "(pitch %d, %d ) (%s)",
+                      (int)imageInfo->width, (int)imageInfo->height,
+                      (int)imageInfo->depth, (int)imageInfo->rowPitch,
+                      (int)imageInfo->slicePitch, IGetErrorString(error));
+            return error;
+        }
+        if (gTestMaxImages)
+            image = (cl_mem)unprotImage;
+        else
+            image = (cl_mem)protImage;
+    }
+    else if (gMemFlagsToUse == CL_MEM_COPY_HOST_PTR)
+    {
+        // Don't use clEnqueueWriteImage; just use copy host ptr to get the data
+        // in
+        unprotImage = create_image_3d(
+            context, image_read_write_flags | CL_MEM_COPY_HOST_PTR,
+            imageInfo->format, imageInfo->width, imageInfo->height,
+            imageInfo->depth, (gEnablePitch ? imageInfo->rowPitch : 0),
+            (gEnablePitch ? imageInfo->slicePitch : 0), imageValues, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("ERROR: Unable to create 3D image of size %d x %d x %d "
+                      "(pitch %d, %d ) (%s)",
+                      (int)imageInfo->width, (int)imageInfo->height,
+                      (int)imageInfo->depth, (int)imageInfo->rowPitch,
+                      (int)imageInfo->slicePitch, IGetErrorString(error));
+            return error;
+        }
+        image = unprotImage;
+    }
+    else // Either CL_MEM_ALLOC_HOST_PTR or none
+    {
+        // Note: if ALLOC_HOST_PTR is used, the driver allocates memory that can
+        // be accessed by the host, but otherwise it works just as if no flag is
+        // specified, so we just do the same thing either way
+        if (!gTestMipmaps)
+        {
+            unprotImage = create_image_3d(
+                context, image_read_write_flags | gMemFlagsToUse,
+                imageInfo->format, imageInfo->width, imageInfo->height,
+                imageInfo->depth, (gEnablePitch ? imageInfo->rowPitch : 0),
+                (gEnablePitch ? imageInfo->slicePitch : 0), imageValues,
+                &error);
+            if (error != CL_SUCCESS)
+            {
+                log_error("ERROR: Unable to create 3D image of size %d x %d x "
+                          "%d (pitch %d, %d ) (%s)",
+                          (int)imageInfo->width, (int)imageInfo->height,
+                          (int)imageInfo->depth, (int)imageInfo->rowPitch,
+                          (int)imageInfo->slicePitch, IGetErrorString(error));
+                return error;
+            }
+            image = unprotImage;
+        }
+        else
+        {
+            cl_image_desc image_desc = { 0 };
+            image_desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+            image_desc.image_width = imageInfo->width;
+            image_desc.image_height = imageInfo->height;
+            image_desc.image_depth = imageInfo->depth;
+            image_desc.num_mip_levels = imageInfo->num_mip_levels;
+
+
+            unprotImage =
+                clCreateImage(context, image_read_write_flags,
+                              imageInfo->format, &image_desc, NULL, &error);
+            if (error != CL_SUCCESS)
+            {
+                log_error("ERROR: Unable to create %d level mipmapped 3D image "
+                          "of size %d x %d x %d (pitch %d, %d ) (%s)",
+                          (int)imageInfo->num_mip_levels, (int)imageInfo->width,
+                          (int)imageInfo->height, (int)imageInfo->depth,
+                          (int)imageInfo->rowPitch, (int)imageInfo->slicePitch,
+                          IGetErrorString(error));
+                return error;
+            }
+            image = unprotImage;
+        }
+    }
+
+    if (gMemFlagsToUse != CL_MEM_COPY_HOST_PTR)
+    {
+        size_t origin[4] = { 0, 0, 0, 0 };
+        size_t region[3] = { imageInfo->width, imageInfo->height,
+                             imageInfo->depth };
+
+        if (gDebugTrace) log_info(" - Writing image...\n");
+
+        if (!gTestMipmaps)
+        {
+
+            error =
+                clEnqueueWriteImage(queue, image, CL_TRUE, origin, region,
+                                    gEnablePitch ? imageInfo->rowPitch : 0,
+                                    gEnablePitch ? imageInfo->slicePitch : 0,
+                                    imageValues, 0, NULL, NULL);
+
+            if (error != CL_SUCCESS)
+            {
+                log_error("ERROR: Unable to write to 3D image of size %d x %d "
+                          "x %d \n",
+                          (int)imageInfo->width, (int)imageInfo->height,
+                          (int)imageInfo->depth);
+                return error;
+            }
+        }
+        else
+        {
+            int nextLevelOffset = 0;
+
+            for (int i = 0; i < imageInfo->num_mip_levels; i++)
+            {
+                origin[3] = i;
+                error = clEnqueueWriteImage(
+                    queue, image, CL_TRUE, origin, region,
+                    /*gEnablePitch ? imageInfo->rowPitch :*/ 0,
+                    /*gEnablePitch ? imageInfo->slicePitch :*/ 0,
+                    ((char *)imageValues + nextLevelOffset), 0, NULL, NULL);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("ERROR: Unable to write to %d level mipmapped 3D "
+                              "image of size %d x %d x %d\n",
+                              (int)imageInfo->num_mip_levels,
+                              (int)imageInfo->width, (int)imageInfo->height,
+                              (int)imageInfo->depth);
+                    return error;
+                }
+                nextLevelOffset += region[0] * region[1] * region[2]
+                    * get_pixel_size(imageInfo->format);
+                // Subsequent mip level dimensions keep halving
+                region[0] = region[0] >> 1 ? region[0] >> 1 : 1;
+                region[1] = region[1] >> 1 ? region[1] >> 1 : 1;
+                region[2] = region[2] >> 1 ? region[2] >> 1 : 1;
+            }
+        }
+    }
+
+    xOffsets = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                              sizeof(cl_float) * imageInfo->width
+                                  * imageInfo->height * imageInfo->depth,
+                              xOffsetValues, &error);
+    test_error(error, "Unable to create x offset buffer");
+    yOffsets = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                              sizeof(cl_float) * imageInfo->width
+                                  * imageInfo->height * imageInfo->depth,
+                              yOffsetValues, &error);
+    test_error(error, "Unable to create y offset buffer");
+    zOffsets = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                              sizeof(cl_float) * imageInfo->width
+                                  * imageInfo->height * imageInfo->depth,
+                              zOffsetValues, &error);
+    test_error(error, "Unable to create y offset buffer");
+    results =
+        clCreateBuffer(context, CL_MEM_READ_WRITE,
+                       get_explicit_type_size(outputType) * 4 * imageInfo->width
+                           * imageInfo->height * imageInfo->depth,
+                       NULL, &error);
+    test_error(error, "Unable to create result buffer");
+
+    // Create sampler to use
+    actualSampler = create_sampler(context, imageSampler, gTestMipmaps, &error);
+    test_error(error, "Unable to create image sampler");
+
+    // Set arguments
+    int idx = 0;
+    error = clSetKernelArg(kernel, idx++, sizeof(cl_mem), &image);
+    test_error(error, "Unable to set kernel arguments");
+    if (!gUseKernelSamplers)
+    {
+        error =
+            clSetKernelArg(kernel, idx++, sizeof(cl_sampler), &actualSampler);
+        test_error(error, "Unable to set kernel arguments");
+    }
+    error = clSetKernelArg(kernel, idx++, sizeof(cl_mem), &xOffsets);
+    test_error(error, "Unable to set kernel arguments");
+    error = clSetKernelArg(kernel, idx++, sizeof(cl_mem), &yOffsets);
+    test_error(error, "Unable to set kernel arguments");
+    error = clSetKernelArg(kernel, idx++, sizeof(cl_mem), &zOffsets);
+    test_error(error, "Unable to set kernel arguments");
+    error = clSetKernelArg(kernel, idx++, sizeof(cl_mem), &results);
+    test_error(error, "Unable to set kernel arguments");
+
+    const float float_offsets[] = { 0.0f,
+                                    MAKE_HEX_FLOAT(0x1.0p-30f, 0x1L, -30),
+                                    0.25f,
+                                    0.3f,
+                                    0.5f - FLT_EPSILON / 4.0f,
+                                    0.5f,
+                                    0.9f,
+                                    1.0f - FLT_EPSILON / 2 };
+    int float_offset_count = sizeof(float_offsets) / sizeof(float_offsets[0]);
+    int numTries = MAX_TRIES, numClamped = MAX_CLAMPED;
+    int loopCount = 2 * float_offset_count;
+    if (!useFloatCoords) loopCount = 1;
+    if (gTestMaxImages)
+    {
+        loopCount = 1;
+        log_info("Testing each size only once with pixel offsets of %g for max "
+                 "sized images.\n",
+                 float_offsets[0]);
+    }
+
+    // Get the maximum absolute error for this format
+    double formatAbsoluteError =
+        get_max_absolute_error(imageInfo->format, imageSampler);
+    if (gDebugTrace)
+        log_info("\tformatAbsoluteError is %e\n", formatAbsoluteError);
+
+    if (0 == initHalf
+        && imageInfo->format->image_channel_data_type == CL_HALF_FLOAT)
+    {
+        initHalf = CL_SUCCESS == DetectFloatToHalfRoundingMode(queue);
+        if (initHalf)
+        {
+            log_info("Half rounding mode successfully detected.\n");
+        }
+    }
+
+    int nextLevelOffset = 0;
+    size_t width_lod = imageInfo->width, height_lod = imageInfo->height,
+           depth_lod = imageInfo->depth;
+
+    // Loop over all mipmap levels, if we are testing mipmapped images.
+    for (int lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels)
+         || (!gTestMipmaps && lod < 1);
+         lod++)
+    {
+        size_t resultValuesSize = width_lod * height_lod * depth_lod
+            * get_explicit_type_size(outputType) * 4;
+        BufferOwningPtr<char> resultValues(malloc(resultValuesSize));
+        float lod_float = (float)lod;
+        if (gTestMipmaps)
+        {
+            // Set the lod kernel arg
+            if (gDebugTrace) log_info(" - Working at mip level %d\n", lod);
+            error = clSetKernelArg(kernel, idx, sizeof(float), &lod_float);
+            test_error(error, "Unable to set kernel arguments");
+        }
+
+        for (int q = 0; q < loopCount; q++)
+        {
+            float offset = float_offsets[q % float_offset_count];
+
+            // Init the coordinates
+            InitFloatCoordsCommon(imageInfo, imageSampler, xOffsetValues,
+                                  yOffsetValues, zOffsetValues,
+                                  q >= float_offset_count ? -offset : offset,
+                                  q >= float_offset_count ? offset : -offset,
+                                  q >= float_offset_count ? -offset : offset,
+                                  imageSampler->normalized_coords, d, lod);
+
+            error =
+                clEnqueueWriteBuffer(queue, xOffsets, CL_TRUE, 0,
+                                     sizeof(cl_float) * imageInfo->height
+                                         * imageInfo->width * imageInfo->depth,
+                                     xOffsetValues, 0, NULL, NULL);
+            test_error(error, "Unable to write x offsets");
+            error =
+                clEnqueueWriteBuffer(queue, yOffsets, CL_TRUE, 0,
+                                     sizeof(cl_float) * imageInfo->height
+                                         * imageInfo->width * imageInfo->depth,
+                                     yOffsetValues, 0, NULL, NULL);
+            test_error(error, "Unable to write y offsets");
+            error =
+                clEnqueueWriteBuffer(queue, zOffsets, CL_TRUE, 0,
+                                     sizeof(cl_float) * imageInfo->height
+                                         * imageInfo->width * imageInfo->depth,
+                                     zOffsetValues, 0, NULL, NULL);
+            test_error(error, "Unable to write z offsets");
+
+
+            memset(resultValues, 0xff, resultValuesSize);
+            clEnqueueWriteBuffer(queue, results, CL_TRUE, 0, resultValuesSize,
+                                 resultValues, 0, NULL, NULL);
+
+            // Figure out thread dimensions
+            threads[0] = (size_t)width_lod;
+            threads[1] = (size_t)height_lod;
+            threads[2] = (size_t)depth_lod;
+
+            // Run the kernel
+            error = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, threads,
+                                           NULL, 0, NULL, NULL);
+            test_error(error, "Unable to run kernel");
+
+            // Get results
+            error = clEnqueueReadBuffer(queue, results, CL_TRUE, 0,
+                                        width_lod * height_lod * depth_lod
+                                            * get_explicit_type_size(outputType)
+                                            * 4,
+                                        resultValues, 0, NULL, NULL);
+            test_error(error, "Unable to read results from kernel");
+            if (gDebugTrace) log_info("    results read\n");
+
+            // Validate results element by element
+            char *imagePtr = (char *)imageValues + nextLevelOffset;
+            /*
+             * FLOAT output type
+             */
+            if (is_sRGBA_order(imageInfo->format->image_channel_order)
+                && (outputType == kFloat))
+            {
+                // Validate float results
+                float *resultPtr = (float *)(char *)resultValues;
+                float expected[4], error = 0.0f;
+                float maxErr = get_max_relative_error(
+                    imageInfo->format, imageSampler, 1 /*3D*/,
+                    CL_FILTER_LINEAR == imageSampler->filter_mode);
+
+                for (size_t z = 0, j = 0; z < depth_lod; z++)
+                {
+                    for (size_t y = 0; y < height_lod; y++)
+                    {
+                        for (size_t x = 0; x < width_lod; x++, j++)
+                        {
+                            // Step 1: go through and see if the results verify
+                            // for the pixel. For the normalized case on a GPU
+                            // we put in offsets to the X, Y and Z to see if we
+                            // land on the right pixel. This addresses the
+                            // significant inaccuracy in GPU normalization in
+                            // OpenCL 1.0.
+                            int checkOnlyOnePixel = 0;
+                            int found_pixel = 0;
+                            float offset = NORM_OFFSET;
+                            if (!imageSampler->normalized_coords
+                                || imageSampler->filter_mode
+                                    != CL_FILTER_NEAREST
+                                || NORM_OFFSET == 0
+#if defined(__APPLE__)
+                                // Apple requires its CPU implementation to do
+                                // correctly rounded address arithmetic in all
+                                // modes
+                                || gDeviceType != CL_DEVICE_TYPE_GPU
+#endif
+                            )
+                                offset = 0.0f; // Loop only once
+
+                            for (float norm_offset_x = -offset;
+                                 norm_offset_x <= offset && !found_pixel;
+                                 norm_offset_x += NORM_OFFSET)
+                            {
+                                for (float norm_offset_y = -offset;
+                                     norm_offset_y <= offset && !found_pixel;
+                                     norm_offset_y += NORM_OFFSET)
+                                {
+                                    for (float norm_offset_z = -offset;
+                                         norm_offset_z <= NORM_OFFSET
+                                         && !found_pixel;
+                                         norm_offset_z += NORM_OFFSET)
+                                    {
+
+                                        int hasDenormals = 0;
+                                        FloatPixel maxPixel =
+                                            sample_image_pixel_float_offset(
+                                                imagePtr, imageInfo,
+                                                xOffsetValues[j],
+                                                yOffsetValues[j],
+                                                zOffsetValues[j], norm_offset_x,
+                                                norm_offset_y, norm_offset_z,
+                                                imageSampler, expected, 0,
+                                                &hasDenormals, lod);
+
+                                        float err1 =
+                                            ABS_ERROR(sRGBmap(resultPtr[0]),
+                                                      sRGBmap(expected[0]));
+                                        float err2 =
+                                            ABS_ERROR(sRGBmap(resultPtr[1]),
+                                                      sRGBmap(expected[1]));
+                                        float err3 =
+                                            ABS_ERROR(sRGBmap(resultPtr[2]),
+                                                      sRGBmap(expected[2]));
+                                        float err4 = ABS_ERROR(resultPtr[3],
+                                                               expected[3]);
+                                        // Clamp to the minimum absolute error
+                                        // for the format
+                                        if (err1 > 0
+                                            && err1 < formatAbsoluteError)
+                                        {
+                                            err1 = 0.0f;
+                                        }
+                                        if (err2 > 0
+                                            && err2 < formatAbsoluteError)
+                                        {
+                                            err2 = 0.0f;
+                                        }
+                                        if (err3 > 0
+                                            && err3 < formatAbsoluteError)
+                                        {
+                                            err3 = 0.0f;
+                                        }
+                                        if (err4 > 0
+                                            && err4 < formatAbsoluteError)
+                                        {
+                                            err4 = 0.0f;
+                                        }
+                                        float maxErr = 0.5;
+
+                                        if (!(err1 <= maxErr)
+                                            || !(err2 <= maxErr)
+                                            || !(err3 <= maxErr)
+                                            || !(err4 <= maxErr))
+                                        {
+                                            // Try flushing the denormals
+                                            if (hasDenormals)
+                                            {
+                                                // If the implementation flushes
+                                                // subnormals to zero, the max
+                                                // error needs to be adjusted.
+                                                maxErr += 4 * FLT_MIN;
+
+                                                maxPixel =
+                                                    sample_image_pixel_float_offset(
+                                                        imagePtr, imageInfo,
+                                                        xOffsetValues[j],
+                                                        yOffsetValues[j],
+                                                        zOffsetValues[j],
+                                                        norm_offset_x,
+                                                        norm_offset_y,
+                                                        norm_offset_z,
+                                                        imageSampler, expected,
+                                                        0, NULL, lod);
+
+                                                err1 = ABS_ERROR(
+                                                    sRGBmap(resultPtr[0]),
+                                                    sRGBmap(expected[0]));
+                                                err2 = ABS_ERROR(
+                                                    sRGBmap(resultPtr[1]),
+                                                    sRGBmap(expected[1]));
+                                                err3 = ABS_ERROR(
+                                                    sRGBmap(resultPtr[2]),
+                                                    sRGBmap(expected[2]));
+                                                err4 = ABS_ERROR(resultPtr[3],
+                                                                 expected[3]);
+                                            }
+                                        }
+
+                                        found_pixel = (err1 <= maxErr)
+                                            && (err2 <= maxErr)
+                                            && (err3 <= maxErr)
+                                            && (err4 <= maxErr);
+                                    } // norm_offset_z
+                                } // norm_offset_y
+                            } // norm_offset_x
+
+                            // Step 2: If we did not find a match, then print
+                            // out debugging info.
+                            if (!found_pixel)
+                            {
+                                // For the normalized case on a GPU we put in
+                                // offsets to the X, Y and Z to see if we land
+                                // on the right pixel. This addresses the
+                                // significant inaccuracy in GPU normalization
+                                // in OpenCL 1.0.
+                                checkOnlyOnePixel = 0;
+                                int shouldReturn = 0;
+                                for (float norm_offset_x = -offset;
+                                     norm_offset_x <= offset
+                                     && !checkOnlyOnePixel;
+                                     norm_offset_x += NORM_OFFSET)
+                                {
+                                    for (float norm_offset_y = -offset;
+                                         norm_offset_y <= offset
+                                         && !checkOnlyOnePixel;
+                                         norm_offset_y += NORM_OFFSET)
+                                    {
+                                        for (float norm_offset_z = -offset;
+                                             norm_offset_z <= offset
+                                             && !checkOnlyOnePixel;
+                                             norm_offset_z += NORM_OFFSET)
+                                        {
+
+                                            int hasDenormals = 0;
+                                            FloatPixel maxPixel =
+                                                sample_image_pixel_float_offset(
+                                                    imagePtr, imageInfo,
+                                                    xOffsetValues[j],
+                                                    yOffsetValues[j],
+                                                    zOffsetValues[j],
+                                                    norm_offset_x,
+                                                    norm_offset_y,
+                                                    norm_offset_z, imageSampler,
+                                                    expected, 0, &hasDenormals,
+                                                    lod);
+
+                                            float err1 =
+                                                ABS_ERROR(sRGBmap(resultPtr[0]),
+                                                          sRGBmap(expected[0]));
+                                            float err2 =
+                                                ABS_ERROR(sRGBmap(resultPtr[1]),
+                                                          sRGBmap(expected[1]));
+                                            float err3 =
+                                                ABS_ERROR(sRGBmap(resultPtr[2]),
+                                                          sRGBmap(expected[2]));
+                                            float err4 = ABS_ERROR(resultPtr[3],
+                                                                   expected[3]);
+                                            float maxErr = 0.6;
+
+                                            if (!(err1 <= maxErr)
+                                                || !(err2 <= maxErr)
+                                                || !(err3 <= maxErr)
+                                                || !(err4 <= maxErr))
+                                            {
+                                                // Try flushing the denormals
+                                                if (hasDenormals)
+                                                {
+                                                    // If the implementation
+                                                    // flushes subnormals to
+                                                    // zero, the max error needs
+                                                    // to be adjusted.
+                                                    maxErr += 4 * FLT_MIN;
+
+                                                    maxPixel =
+                                                        sample_image_pixel_float(
+                                                            imagePtr, imageInfo,
+                                                            xOffsetValues[j],
+                                                            yOffsetValues[j],
+                                                            zOffsetValues[j],
+                                                            imageSampler,
+                                                            expected, 0, NULL,
+                                                            lod);
+
+                                                    err1 = ABS_ERROR(
+                                                        sRGBmap(resultPtr[0]),
+                                                        sRGBmap(expected[0]));
+                                                    err2 = ABS_ERROR(
+                                                        sRGBmap(resultPtr[1]),
+                                                        sRGBmap(expected[1]));
+                                                    err3 = ABS_ERROR(
+                                                        sRGBmap(resultPtr[2]),
+                                                        sRGBmap(expected[2]));
+                                                    err4 =
+                                                        ABS_ERROR(resultPtr[3],
+                                                                  expected[3]);
+                                                }
+                                            }
+
+                                            if (!(err1 <= maxErr)
+                                                || !(err2 <= maxErr)
+                                                || !(err3 <= maxErr)
+                                                || !(err4 <= maxErr))
+                                            {
+                                                log_error(
+                                                    "FAILED norm_offsets: %g , "
+                                                    "%g , %g:\n",
+                                                    norm_offset_x,
+                                                    norm_offset_y,
+                                                    norm_offset_z);
+
+                                                float tempOut[4];
+                                                shouldReturn |=
+                                                    determine_validation_error_offset<
+                                                        float>(
+                                                        imagePtr, imageInfo,
+                                                        imageSampler, resultPtr,
+                                                        expected, error,
+                                                        xOffsetValues[j],
+                                                        yOffsetValues[j],
+                                                        zOffsetValues[j],
+                                                        norm_offset_x,
+                                                        norm_offset_y,
+                                                        norm_offset_z, j,
+                                                        numTries, numClamped,
+                                                        true, lod);
+                                                log_error("Step by step:\n");
+                                                FloatPixel temp =
+                                                    sample_image_pixel_float_offset(
+                                                        imagePtr, imageInfo,
+                                                        xOffsetValues[j],
+                                                        yOffsetValues[j],
+                                                        zOffsetValues[j],
+                                                        norm_offset_x,
+                                                        norm_offset_y,
+                                                        norm_offset_z,
+                                                        imageSampler, tempOut,
+                                                        1 /*verbose*/,
+                                                        &hasDenormals, lod);
+                                                log_error(
+                                                    "\tulps: %2.2f, %2.2f, "
+                                                    "%2.2f, %2.2f  (max "
+                                                    "allowed: %2.2f)\n\n",
+                                                    Ulp_Error(resultPtr[0],
+                                                              expected[0]),
+                                                    Ulp_Error(resultPtr[1],
+                                                              expected[1]),
+                                                    Ulp_Error(resultPtr[2],
+                                                              expected[2]),
+                                                    Ulp_Error(resultPtr[3],
+                                                              expected[3]),
+                                                    Ulp_Error(
+                                                        MAKE_HEX_FLOAT(
+                                                            0x1.000002p0f,
+                                                            0x1000002L, -24)
+                                                            + maxErr,
+                                                        MAKE_HEX_FLOAT(
+                                                            0x1.000002p0f,
+                                                            0x1000002L, -24)));
+                                            }
+                                            else
+                                            {
+                                                log_error(
+                                                    "Test error: we should "
+                                                    "have detected this "
+                                                    "passing above.\n");
+                                            }
+                                        } // norm_offset_z
+                                    } // norm_offset_y
+                                } // norm_offset_x
+                                if (shouldReturn) return 1;
+                            } // if (!found_pixel)
+
+                            resultPtr += 4;
+                        }
+                    }
+                }
+            }
+            /*
+             * FLOAT output type
+             */
+            else if (outputType == kFloat)
+            {
+                // Validate float results
+                float *resultPtr = (float *)(char *)resultValues;
+                float expected[4], error = 0.0f;
+                float maxErr = get_max_relative_error(
+                    imageInfo->format, imageSampler, 1 /*3D*/,
+                    CL_FILTER_LINEAR == imageSampler->filter_mode);
+
+                for (size_t z = 0, j = 0; z < depth_lod; z++)
+                {
+                    for (size_t y = 0; y < height_lod; y++)
+                    {
+                        for (size_t x = 0; x < width_lod; x++, j++)
+                        {
+                            // Step 1: go through and see if the results verify
+                            // for the pixel. For the normalized case on a GPU
+                            // we put in offsets to the X, Y and Z to see if we
+                            // land on the right pixel. This addresses the
+                            // significant inaccuracy in GPU normalization in
+                            // OpenCL 1.0.
+                            int checkOnlyOnePixel = 0;
+                            int found_pixel = 0;
+                            float offset = NORM_OFFSET;
+                            if (!imageSampler->normalized_coords
+                                || imageSampler->filter_mode
+                                    != CL_FILTER_NEAREST
+                                || NORM_OFFSET == 0
+#if defined(__APPLE__)
+                                // Apple requires its CPU implementation to do
+                                // correctly rounded address arithmetic in all
+                                // modes
+                                || gDeviceType != CL_DEVICE_TYPE_GPU
+#endif
+                            )
+                                offset = 0.0f; // Loop only once
+
+                            for (float norm_offset_x = -offset;
+                                 norm_offset_x <= offset && !found_pixel;
+                                 norm_offset_x += NORM_OFFSET)
+                            {
+                                for (float norm_offset_y = -offset;
+                                     norm_offset_y <= offset && !found_pixel;
+                                     norm_offset_y += NORM_OFFSET)
+                                {
+                                    for (float norm_offset_z = -offset;
+                                         norm_offset_z <= NORM_OFFSET
+                                         && !found_pixel;
+                                         norm_offset_z += NORM_OFFSET)
+                                    {
+
+                                        int hasDenormals = 0;
+                                        FloatPixel maxPixel =
+                                            sample_image_pixel_float_offset(
+                                                imagePtr, imageInfo,
+                                                xOffsetValues[j],
+                                                yOffsetValues[j],
+                                                zOffsetValues[j], norm_offset_x,
+                                                norm_offset_y, norm_offset_z,
+                                                imageSampler, expected, 0,
+                                                &hasDenormals, lod);
+
+                                        float err1 = ABS_ERROR(resultPtr[0],
+                                                               expected[0]);
+                                        float err2 = ABS_ERROR(resultPtr[1],
+                                                               expected[1]);
+                                        float err3 = ABS_ERROR(resultPtr[2],
+                                                               expected[2]);
+                                        float err4 = ABS_ERROR(resultPtr[3],
+                                                               expected[3]);
+                                        // Clamp to the minimum absolute error
+                                        // for the format
+                                        if (err1 > 0
+                                            && err1 < formatAbsoluteError)
+                                        {
+                                            err1 = 0.0f;
+                                        }
+                                        if (err2 > 0
+                                            && err2 < formatAbsoluteError)
+                                        {
+                                            err2 = 0.0f;
+                                        }
+                                        if (err3 > 0
+                                            && err3 < formatAbsoluteError)
+                                        {
+                                            err3 = 0.0f;
+                                        }
+                                        if (err4 > 0
+                                            && err4 < formatAbsoluteError)
+                                        {
+                                            err4 = 0.0f;
+                                        }
+                                        float maxErr1 = MAX(
+                                            maxErr * maxPixel.p[0], FLT_MIN);
+                                        float maxErr2 = MAX(
+                                            maxErr * maxPixel.p[1], FLT_MIN);
+                                        float maxErr3 = MAX(
+                                            maxErr * maxPixel.p[2], FLT_MIN);
+                                        float maxErr4 = MAX(
+                                            maxErr * maxPixel.p[3], FLT_MIN);
+
+                                        if (!(err1 <= maxErr1)
+                                            || !(err2 <= maxErr2)
+                                            || !(err3 <= maxErr3)
+                                            || !(err4 <= maxErr4))
+                                        {
+                                            // Try flushing the denormals
+                                            if (hasDenormals)
+                                            {
+                                                // If the implementation decides
+                                                // to flush subnormals to zero,
+                                                // max error must be adjusted.
+                                                maxErr1 += 4 * FLT_MIN;
+                                                maxErr2 += 4 * FLT_MIN;
+                                                maxErr3 += 4 * FLT_MIN;
+                                                maxErr4 += 4 * FLT_MIN;
+
+                                                maxPixel =
+                                                    sample_image_pixel_float_offset(
+                                                        imagePtr, imageInfo,
+                                                        xOffsetValues[j],
+                                                        yOffsetValues[j],
+                                                        zOffsetValues[j],
+                                                        norm_offset_x,
+                                                        norm_offset_y,
+                                                        norm_offset_z,
+                                                        imageSampler, expected,
+                                                        0, NULL, lod);
+
+                                                err1 = ABS_ERROR(resultPtr[0],
+                                                                 expected[0]);
+                                                err2 = ABS_ERROR(resultPtr[1],
+                                                                 expected[1]);
+                                                err3 = ABS_ERROR(resultPtr[2],
+                                                                 expected[2]);
+                                                err4 = ABS_ERROR(resultPtr[3],
+                                                                 expected[3]);
+                                            }
+                                        }
+
+                                        found_pixel = (err1 <= maxErr1)
+                                            && (err2 <= maxErr2)
+                                            && (err3 <= maxErr3)
+                                            && (err4 <= maxErr4);
+                                    } // norm_offset_z
+                                } // norm_offset_y
+                            } // norm_offset_x
+
+                            // Step 2: If we did not find a match, then print
+                            // out debugging info.
+                            if (!found_pixel)
+                            {
+                                // For the normalized case on a GPU we put in
+                                // offsets to the X, Y and Z to see if we land
+                                // on the right pixel. This addresses the
+                                // significant inaccuracy in GPU normalization
+                                // in OpenCL 1.0.
+                                checkOnlyOnePixel = 0;
+                                int shouldReturn = 0;
+                                for (float norm_offset_x = -offset;
+                                     norm_offset_x <= offset
+                                     && !checkOnlyOnePixel;
+                                     norm_offset_x += NORM_OFFSET)
+                                {
+                                    for (float norm_offset_y = -offset;
+                                         norm_offset_y <= offset
+                                         && !checkOnlyOnePixel;
+                                         norm_offset_y += NORM_OFFSET)
+                                    {
+                                        for (float norm_offset_z = -offset;
+                                             norm_offset_z <= offset
+                                             && !checkOnlyOnePixel;
+                                             norm_offset_z += NORM_OFFSET)
+                                        {
+
+                                            int hasDenormals = 0;
+                                            FloatPixel maxPixel =
+                                                sample_image_pixel_float_offset(
+                                                    imagePtr, imageInfo,
+                                                    xOffsetValues[j],
+                                                    yOffsetValues[j],
+                                                    zOffsetValues[j],
+                                                    norm_offset_x,
+                                                    norm_offset_y,
+                                                    norm_offset_z, imageSampler,
+                                                    expected, 0, &hasDenormals,
+                                                    lod);
+
+                                            float err1 = ABS_ERROR(resultPtr[0],
+                                                                   expected[0]);
+                                            float err2 = ABS_ERROR(resultPtr[1],
+                                                                   expected[1]);
+                                            float err3 = ABS_ERROR(resultPtr[2],
+                                                                   expected[2]);
+                                            float err4 = ABS_ERROR(resultPtr[3],
+                                                                   expected[3]);
+                                            float maxErr1 =
+                                                MAX(maxErr * maxPixel.p[0],
+                                                    FLT_MIN);
+                                            float maxErr2 =
+                                                MAX(maxErr * maxPixel.p[1],
+                                                    FLT_MIN);
+                                            float maxErr3 =
+                                                MAX(maxErr * maxPixel.p[2],
+                                                    FLT_MIN);
+                                            float maxErr4 =
+                                                MAX(maxErr * maxPixel.p[3],
+                                                    FLT_MIN);
+
+
+                                            if (!(err1 <= maxErr1)
+                                                || !(err2 <= maxErr2)
+                                                || !(err3 <= maxErr3)
+                                                || !(err4 <= maxErr4))
+                                            {
+                                                // Try flushing the denormals
+                                                if (hasDenormals)
+                                                {
+                                                    maxErr1 += 4 * FLT_MIN;
+                                                    maxErr2 += 4 * FLT_MIN;
+                                                    maxErr3 += 4 * FLT_MIN;
+                                                    maxErr4 += 4 * FLT_MIN;
+
+                                                    maxPixel =
+                                                        sample_image_pixel_float(
+                                                            imagePtr, imageInfo,
+                                                            xOffsetValues[j],
+                                                            yOffsetValues[j],
+                                                            zOffsetValues[j],
+                                                            imageSampler,
+                                                            expected, 0, NULL,
+                                                            lod);
+
+                                                    err1 =
+                                                        ABS_ERROR(resultPtr[0],
+                                                                  expected[0]);
+                                                    err2 =
+                                                        ABS_ERROR(resultPtr[1],
+                                                                  expected[1]);
+                                                    err3 =
+                                                        ABS_ERROR(resultPtr[2],
+                                                                  expected[2]);
+                                                    err4 =
+                                                        ABS_ERROR(resultPtr[3],
+                                                                  expected[3]);
+                                                }
+                                            }
+
+                                            if (!(err1 <= maxErr1)
+                                                || !(err2 <= maxErr2)
+                                                || !(err3 <= maxErr3)
+                                                || !(err4 <= maxErr4))
+                                            {
+                                                log_error(
+                                                    "FAILED norm_offsets: %g , "
+                                                    "%g , %g:\n",
+                                                    norm_offset_x,
+                                                    norm_offset_y,
+                                                    norm_offset_z);
+
+                                                float tempOut[4];
+                                                shouldReturn |=
+                                                    determine_validation_error_offset<
+                                                        float>(
+                                                        imagePtr, imageInfo,
+                                                        imageSampler, resultPtr,
+                                                        expected, error,
+                                                        xOffsetValues[j],
+                                                        yOffsetValues[j],
+                                                        zOffsetValues[j],
+                                                        norm_offset_x,
+                                                        norm_offset_y,
+                                                        norm_offset_z, j,
+                                                        numTries, numClamped,
+                                                        true, lod);
+                                                log_error("Step by step:\n");
+                                                FloatPixel temp =
+                                                    sample_image_pixel_float_offset(
+                                                        imagePtr, imageInfo,
+                                                        xOffsetValues[j],
+                                                        yOffsetValues[j],
+                                                        zOffsetValues[j],
+                                                        norm_offset_x,
+                                                        norm_offset_y,
+                                                        norm_offset_z,
+                                                        imageSampler, tempOut,
+                                                        1 /*verbose*/,
+                                                        &hasDenormals, lod);
+                                                log_error(
+                                                    "\tulps: %2.2f, %2.2f, "
+                                                    "%2.2f, %2.2f  (max "
+                                                    "allowed: %2.2f)\n\n",
+                                                    Ulp_Error(resultPtr[0],
+                                                              expected[0]),
+                                                    Ulp_Error(resultPtr[1],
+                                                              expected[1]),
+                                                    Ulp_Error(resultPtr[2],
+                                                              expected[2]),
+                                                    Ulp_Error(resultPtr[3],
+                                                              expected[3]),
+                                                    Ulp_Error(
+                                                        MAKE_HEX_FLOAT(
+                                                            0x1.000002p0f,
+                                                            0x1000002L, -24)
+                                                            + maxErr,
+                                                        MAKE_HEX_FLOAT(
+                                                            0x1.000002p0f,
+                                                            0x1000002L, -24)));
+                                            }
+                                            else
+                                            {
+                                                log_error(
+                                                    "Test error: we should "
+                                                    "have detected this "
+                                                    "passing above.\n");
+                                            }
+                                        } // norm_offset_z
+                                    } // norm_offset_y
+                                } // norm_offset_x
+                                if (shouldReturn) return 1;
+                            } // if (!found_pixel)
+
+                            resultPtr += 4;
+                        }
+                    }
+                }
+            }
+            /*
+             * UINT output type
+             */
+            else if (outputType == kUInt)
+            {
+                // Validate unsigned integer results
+                unsigned int *resultPtr = (unsigned int *)(char *)resultValues;
+                unsigned int expected[4];
+                float error;
+                for (size_t z = 0, j = 0; z < depth_lod; z++)
+                {
+                    for (size_t y = 0; y < height_lod; y++)
+                    {
+                        for (size_t x = 0; x < width_lod; x++, j++)
+                        {
+                            // Step 1: go through and see if the results verify
+                            // for the pixel. For the normalized case on a GPU
+                            // we put in offsets to the X, Y and Z to see if we
+                            // land on the right pixel. This addresses the
+                            // significant inaccuracy in GPU normalization in
+                            // OpenCL 1.0.
+                            int checkOnlyOnePixel = 0;
+                            int found_pixel = 0;
+                            for (float norm_offset_x = -NORM_OFFSET;
+                                 norm_offset_x <= NORM_OFFSET && !found_pixel
+                                 && !checkOnlyOnePixel;
+                                 norm_offset_x += NORM_OFFSET)
+                            {
+                                for (float norm_offset_y = -NORM_OFFSET;
+                                     norm_offset_y <= NORM_OFFSET
+                                     && !found_pixel && !checkOnlyOnePixel;
+                                     norm_offset_y += NORM_OFFSET)
+                                {
+                                    for (float norm_offset_z = -NORM_OFFSET;
+                                         norm_offset_z <= NORM_OFFSET
+                                         && !found_pixel && !checkOnlyOnePixel;
+                                         norm_offset_z += NORM_OFFSET)
+                                    {
+
+                                        // If we are not on a GPU, or we are not
+                                        // normalized, then only test with
+                                        // offsets (0.0, 0.0, 0.0), i.e., test
+                                        // one pixel.
+                                        if (!imageSampler->normalized_coords
+                                            || gDeviceType != CL_DEVICE_TYPE_GPU
+                                            || NORM_OFFSET == 0)
+                                        {
+                                            norm_offset_x = 0.0f;
+                                            norm_offset_y = 0.0f;
+                                            norm_offset_z = 0.0f;
+                                            checkOnlyOnePixel = 1;
+                                        }
+
+                                        sample_image_pixel_offset<unsigned int>(
+                                            imagePtr, imageInfo,
+                                            xOffsetValues[j], yOffsetValues[j],
+                                            zOffsetValues[j], norm_offset_x,
+                                            norm_offset_y, norm_offset_z,
+                                            imageSampler, expected, lod);
+
+                                        error = errMax(
+                                            errMax(abs_diff_uint(expected[0],
+                                                                 resultPtr[0]),
+                                                   abs_diff_uint(expected[1],
+                                                                 resultPtr[1])),
+                                            errMax(
+                                                abs_diff_uint(expected[2],
+                                                              resultPtr[2]),
+                                                abs_diff_uint(expected[3],
+                                                              resultPtr[3])));
+
+                                        if (error < MAX_ERR) found_pixel = 1;
+                                    } // norm_offset_z
+                                } // norm_offset_y
+                            } // norm_offset_x
+
+                            // Step 2: If we did not find a match, then print
+                            // out debugging info.
+                            if (!found_pixel)
+                            {
+                                // For the normalized case on a GPU we put in
+                                // offsets to the X, Y and Z to see if we land
+                                // on the right pixel. This addresses the
+                                // significant inaccuracy in GPU normalization
+                                // in OpenCL 1.0.
+                                checkOnlyOnePixel = 0;
+                                int shouldReturn = 0;
+                                for (float norm_offset_x = -NORM_OFFSET;
+                                     norm_offset_x <= NORM_OFFSET
+                                     && !checkOnlyOnePixel;
+                                     norm_offset_x += NORM_OFFSET)
+                                {
+                                    for (float norm_offset_y = -NORM_OFFSET;
+                                         norm_offset_y <= NORM_OFFSET
+                                         && !checkOnlyOnePixel;
+                                         norm_offset_y += NORM_OFFSET)
+                                    {
+                                        for (float norm_offset_z = -NORM_OFFSET;
+                                             norm_offset_z <= NORM_OFFSET
+                                             && !checkOnlyOnePixel;
+                                             norm_offset_z += NORM_OFFSET)
+                                        {
+
+                                            // If we are not on a GPU, or we are
+                                            // not normalized, then only test
+                                            // with offsets (0.0, 0.0, 0.0),
+                                            // i.e., test one pixel.
+                                            if (!imageSampler->normalized_coords
+                                                || gDeviceType
+                                                    != CL_DEVICE_TYPE_GPU
+                                                || NORM_OFFSET == 0)
+                                            {
+                                                norm_offset_x = 0.0f;
+                                                norm_offset_y = 0.0f;
+                                                norm_offset_z = 0.0f;
+                                                checkOnlyOnePixel = 1;
+                                            }
+
+                                            sample_image_pixel_offset<
+                                                unsigned int>(
+                                                imagePtr, imageInfo,
+                                                xOffsetValues[j],
+                                                yOffsetValues[j],
+                                                zOffsetValues[j], norm_offset_x,
+                                                norm_offset_y, norm_offset_z,
+                                                imageSampler, expected, lod);
+
+                                            error = errMax(
+                                                errMax(
+                                                    abs_diff_uint(expected[0],
+                                                                  resultPtr[0]),
+                                                    abs_diff_uint(
+                                                        expected[1],
+                                                        resultPtr[1])),
+                                                errMax(
+                                                    abs_diff_uint(expected[2],
+                                                                  resultPtr[2]),
+                                                    abs_diff_uint(
+                                                        expected[3],
+                                                        resultPtr[3])));
+
+                                            if (error > MAX_ERR)
+                                            {
+                                                log_error(
+                                                    "FAILED norm_offsets: %g , "
+                                                    "%g , %g:\n",
+                                                    norm_offset_x,
+                                                    norm_offset_y,
+                                                    norm_offset_z);
+                                                shouldReturn |=
+                                                    determine_validation_error_offset<
+                                                        unsigned int>(
+                                                        imagePtr, imageInfo,
+                                                        imageSampler, resultPtr,
+                                                        expected, error,
+                                                        xOffsetValues[j],
+                                                        yOffsetValues[j],
+                                                        zOffsetValues[j],
+                                                        norm_offset_x,
+                                                        norm_offset_y,
+                                                        norm_offset_z, j,
+                                                        numTries, numClamped,
+                                                        false, lod);
+                                            }
+                                            else
+                                            {
+                                                log_error(
+                                                    "Test error: we should "
+                                                    "have detected this "
+                                                    "passing above.\n");
+                                            }
+                                        } // norm_offset_z
+                                    } // norm_offset_y
+                                } // norm_offset_x
+                                if (shouldReturn) return 1;
+                            } // if (!found_pixel)
+
+                            resultPtr += 4;
+                        }
+                    }
+                }
+            }
+            else
+            /*
+             * INT output type
+             */
+            {
+                // Validate integer results
+                int *resultPtr = (int *)(char *)resultValues;
+                int expected[4];
+                float error;
+                for (size_t z = 0, j = 0; z < depth_lod; z++)
+                {
+                    for (size_t y = 0; y < height_lod; y++)
+                    {
+                        for (size_t x = 0; x < width_lod; x++, j++)
+                        {
+                            // Step 1: go through and see if the results verify
+                            // for the pixel. For the normalized case on a GPU
+                            // we put in offsets to the X, Y and Z to see if we
+                            // land on the right pixel. This addresses the
+                            // significant inaccuracy in GPU normalization in
+                            // OpenCL 1.0.
+                            int checkOnlyOnePixel = 0;
+                            int found_pixel = 0;
+                            for (float norm_offset_x = -NORM_OFFSET;
+                                 norm_offset_x <= NORM_OFFSET && !found_pixel
+                                 && !checkOnlyOnePixel;
+                                 norm_offset_x += NORM_OFFSET)
+                            {
+                                for (float norm_offset_y = -NORM_OFFSET;
+                                     norm_offset_y <= NORM_OFFSET
+                                     && !found_pixel && !checkOnlyOnePixel;
+                                     norm_offset_y += NORM_OFFSET)
+                                {
+                                    for (float norm_offset_z = -NORM_OFFSET;
+                                         norm_offset_z <= NORM_OFFSET
+                                         && !found_pixel && !checkOnlyOnePixel;
+                                         norm_offset_z += NORM_OFFSET)
+                                    {
+
+                                        // If we are not on a GPU, or we are not
+                                        // normalized, then only test with
+                                        // offsets (0.0, 0.0, 0.0), i.e. test
+                                        // one pixel.
+                                        if (!imageSampler->normalized_coords
+                                            || gDeviceType != CL_DEVICE_TYPE_GPU
+                                            || NORM_OFFSET == 0)
+                                        {
+                                            norm_offset_x = 0.0f;
+                                            norm_offset_y = 0.0f;
+                                            norm_offset_z = 0.0f;
+                                            checkOnlyOnePixel = 1;
+                                        }
+
+                                        sample_image_pixel_offset<int>(
+                                            imagePtr, imageInfo,
+                                            xOffsetValues[j], yOffsetValues[j],
+                                            zOffsetValues[j], norm_offset_x,
+                                            norm_offset_y, norm_offset_z,
+                                            imageSampler, expected, lod);
+
+                                        error = errMax(
+                                            errMax(abs_diff_int(expected[0],
+                                                                resultPtr[0]),
+                                                   abs_diff_int(expected[1],
+                                                                resultPtr[1])),
+                                            errMax(abs_diff_int(expected[2],
+                                                                resultPtr[2]),
+                                                   abs_diff_int(expected[3],
+                                                                resultPtr[3])));
+
+                                        if (error < MAX_ERR) found_pixel = 1;
+                                    } // norm_offset_z
+                                } // norm_offset_y
+                            } // norm_offset_x
+
+                            // Step 2: If we did not find a match, then print
+                            // out debugging info.
+                            if (!found_pixel)
+                            {
+                                // For the normalized case on a GPU we put in
+                                // offsets to the X, Y and Z to see if we land
+                                // on the right pixel. This addresses the
+                                // significant inaccuracy in GPU normalization
+                                // in OpenCL 1.0.
+                                checkOnlyOnePixel = 0;
+                                int shouldReturn = 0;
+                                for (float norm_offset_x = -NORM_OFFSET;
+                                     norm_offset_x <= NORM_OFFSET
+                                     && !checkOnlyOnePixel;
+                                     norm_offset_x += NORM_OFFSET)
+                                {
+                                    for (float norm_offset_y = -NORM_OFFSET;
+                                         norm_offset_y <= NORM_OFFSET
+                                         && !checkOnlyOnePixel;
+                                         norm_offset_y += NORM_OFFSET)
+                                    {
+                                        for (float norm_offset_z = -NORM_OFFSET;
+                                             norm_offset_z <= NORM_OFFSET
+                                             && !checkOnlyOnePixel;
+                                             norm_offset_z += NORM_OFFSET)
+                                        {
+
+                                            // If we are not on a GPU, or we are
+                                            // not normalized, then only test
+                                            // with offsets (0.0, 0.0, 0.0),
+                                            // i.e. test one pixel.
+                                            if (!imageSampler->normalized_coords
+                                                || gDeviceType
+                                                    != CL_DEVICE_TYPE_GPU
+                                                || NORM_OFFSET == 0
+                                                || NORM_OFFSET == 0
+                                                || NORM_OFFSET == 0)
+                                            {
+                                                norm_offset_x = 0.0f;
+                                                norm_offset_y = 0.0f;
+                                                norm_offset_z = 0.0f;
+                                                checkOnlyOnePixel = 1;
+                                            }
+
+                                            sample_image_pixel_offset<int>(
+                                                imagePtr, imageInfo,
+                                                xOffsetValues[j],
+                                                yOffsetValues[j],
+                                                zOffsetValues[j], norm_offset_x,
+                                                norm_offset_y, norm_offset_z,
+                                                imageSampler, expected, lod);
+
+                                            error = errMax(
+                                                errMax(
+                                                    abs_diff_int(expected[0],
+                                                                 resultPtr[0]),
+                                                    abs_diff_int(expected[1],
+                                                                 resultPtr[1])),
+                                                errMax(
+                                                    abs_diff_int(expected[2],
+                                                                 resultPtr[2]),
+                                                    abs_diff_int(
+                                                        expected[3],
+                                                        resultPtr[3])));
+
+                                            if (error > MAX_ERR)
+                                            {
+                                                log_error(
+                                                    "FAILED norm_offsets: %g , "
+                                                    "%g , %g:\n",
+                                                    norm_offset_x,
+                                                    norm_offset_y,
+                                                    norm_offset_z);
+                                                shouldReturn |=
+                                                    determine_validation_error_offset<
+                                                        int>(
+                                                        imagePtr, imageInfo,
+                                                        imageSampler, resultPtr,
+                                                        expected, error,
+                                                        xOffsetValues[j],
+                                                        yOffsetValues[j],
+                                                        zOffsetValues[j],
+                                                        norm_offset_x,
+                                                        norm_offset_y,
+                                                        norm_offset_z, j,
+                                                        numTries, numClamped,
+                                                        false, lod);
+                                            }
+                                            else
+                                            {
+                                                log_error(
+                                                    "Test error: we should "
+                                                    "have detected this "
+                                                    "passing above.\n");
+                                            }
+                                        } // norm_offset_z
+                                    } // norm_offset_y
+                                } // norm_offset_x
+                                if (shouldReturn) return 1;
+                            } // if (!found_pixel)
+
+                            resultPtr += 4;
+                        }
+                    }
+                }
+            }
+        }
+        {
+            nextLevelOffset += width_lod * height_lod * depth_lod
+                * get_pixel_size(imageInfo->format);
+            width_lod = (width_lod >> 1) ? (width_lod >> 1) : 1;
+            height_lod = (height_lod >> 1) ? (height_lod >> 1) : 1;
+            depth_lod = (depth_lod >> 1) ? (depth_lod >> 1) : 1;
+        }
+    }
+
+    return numTries != MAX_TRIES || numClamped != MAX_CLAMPED;
+}
\ No newline at end of file
diff --git a/test_conformance/images/kernel_read_write/test_common.h b/test_conformance/images/kernel_read_write/test_common.h
index cb0d54a48c..e7ecbe0b55 100644
--- a/test_conformance/images/kernel_read_write/test_common.h
+++ b/test_conformance/images/kernel_read_write/test_common.h
@@ -1,3 +1,18 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "../testBase.h"
 
@@ -10,7 +25,207 @@
 #define MAX_CLAMPED 1
 
 extern cl_sampler create_sampler(cl_context context, image_sampler_data *sdata, bool test_mipmaps, cl_int *error);
+extern void read_image_pixel_float(void *imageData, image_descriptor *imageInfo,
+                                   int x, int y, int z, float *outData);
 
 extern bool gExtraValidateInfo;
 extern bool gDisableOffsets;
 extern bool gUseKernelSamplers;
+extern cl_mem_flags gMemFlagsToUse;
+extern int gtestTypesToRun;
+extern uint64_t gRoundingStartValue;
+extern bool gPrintOptions;
+
+extern int test_read_image(cl_context context, cl_command_queue queue,
+                           cl_kernel kernel, image_descriptor *imageInfo,
+                           image_sampler_data *imageSampler,
+                           bool useFloatCoords, ExplicitType outputType,
+                           MTdata d);
+
+extern void InitFloatCoordsCommon(image_descriptor *imageInfo,
+                                  image_sampler_data *imageSampler,
+                                  float *xOffsets, float *yOffsets,
+                                  float *zOffsets, float xfract, float yfract,
+                                  float zfract, int normalized_coords, MTdata d,
+                                  int lod);
+
+// Reports a sample whose read result differs from the expected value.
+// Searches the image for the returned pixel and computes the integer
+// coordinates the sampler resolves to, so that suspected clamping errors
+// (budgeted by numClamped) are told apart from other mismatches (budgeted
+// by numTries; hitting one also sets numClamped to -1). j is the sample
+// index shown in the messages and printAsFloat selects %g over %x output.
+// Returns -1 once the relevant budget reaches zero, otherwise 0.
+template <class T>
+int determine_validation_error_offset(
+    void *imagePtr, image_descriptor *imageInfo,
+    image_sampler_data *imageSampler, T *resultPtr, T *expected, float error,
+    float x, float y, float z, float xAddressOffset, float yAddressOffset,
+    float zAddressOffset, size_t j, int &numTries, int &numClamped,
+    bool printAsFloat, int lod)
+{
+    int actualX, actualY, actualZ;
+    int found = debug_find_pixel_in_image(imagePtr, imageInfo, resultPtr,
+                                          &actualX, &actualY, &actualZ, lod);
+    bool clampingErr = false, clamped = false, otherClampingBug = false;
+    int clampedX, clampedY, clampedZ;
+
+    size_t imageWidth = imageInfo->width, imageHeight = imageInfo->height,
+           imageDepth = imageInfo->depth;
+
+    clamped = get_integer_coords_offset(x, y, z, xAddressOffset, yAddressOffset,
+                                        zAddressOffset, imageWidth, imageHeight,
+                                        imageDepth, imageSampler, imageInfo,
+                                        clampedX, clampedY, clampedZ);
+
+    if (found)
+    {
+        // Is it a clamping bug?
+        if (clamped && clampedX == actualX && clampedY == actualY
+            && clampedZ == actualZ)
+        {
+            if ((--numClamped) == 0)
+            {
+                if (printAsFloat)
+                {
+                    log_error("Sample %zu: coord {%f(%a),%f(%a),%f(%a)} did "
+                              "not validate! Expected (%g,%g,%g,%g), got "
+                              "(%g,%g,%g,%g), error of %g\n",
+                              j, x, x, y, y, z, z, (float)expected[0],
+                              (float)expected[1], (float)expected[2],
+                              (float)expected[3], (float)resultPtr[0],
+                              (float)resultPtr[1], (float)resultPtr[2],
+                              (float)resultPtr[3], error);
+                }
+                else
+                {
+                    log_error(
+                        "Sample %zu: coord {%f(%a),%f(%a),%f(%a)} did not "
+                        "validate! Expected (%x,%x,%x,%x), got (%x,%x,%x,%x)\n",
+                        j, x, x, y, y, z, z, (int)expected[0], (int)expected[1],
+                        (int)expected[2], (int)expected[3], (int)resultPtr[0],
+                        (int)resultPtr[1], (int)resultPtr[2],
+                        (int)resultPtr[3]);
+                }
+                log_error("ERROR: TEST FAILED: Read is erroneously clamping "
+                          "coordinates!\n");
+                return -1;
+            }
+            clampingErr = true;
+            otherClampingBug = true;
+        }
+    }
+    if (clamped && !otherClampingBug)
+    {
+        // If we are in clamp-to-edge mode and we're getting zeroes, it's
+        // possible we're getting border erroneously
+        if (resultPtr[0] == 0 && resultPtr[1] == 0 && resultPtr[2] == 0
+            && resultPtr[3] == 0)
+        {
+            if ((--numClamped) == 0)
+            {
+                if (printAsFloat)
+                {
+                    log_error("Sample %zu: coord {%f(%a),%f(%a),%f(%a)} did "
+                              "not validate! Expected (%g,%g,%g,%g), got "
+                              "(%g,%g,%g,%g), error of %g\n",
+                              j, x, x, y, y, z, z, (float)expected[0],
+                              (float)expected[1], (float)expected[2],
+                              (float)expected[3], (float)resultPtr[0],
+                              (float)resultPtr[1], (float)resultPtr[2],
+                              (float)resultPtr[3], error);
+                }
+                else
+                {
+                    log_error(
+                        "Sample %zu: coord {%f(%a),%f(%a),%f(%a)} did not "
+                        "validate! Expected (%x,%x,%x,%x), got (%x,%x,%x,%x)\n",
+                        j, x, x, y, y, z, z, (int)expected[0], (int)expected[1],
+                        (int)expected[2], (int)expected[3], (int)resultPtr[0],
+                        (int)resultPtr[1], (int)resultPtr[2],
+                        (int)resultPtr[3]);
+                }
+                log_error("ERROR: TEST FAILED: Clamping is erroneously "
+                          "returning border color!\n");
+                return -1;
+            }
+            clampingErr = true;
+        }
+    }
+    if (!clampingErr)
+    {
+        // Not classified as a clamping issue: log the mismatch in full.
+        if (true) // gExtraValidateInfo )
+        {
+            if (printAsFloat)
+            {
+                log_error("Sample %zu: coord {%f(%a),%f(%a),%f(%a)} did not "
+                          "validate!\n\tExpected (%g,%g,%g,%g),\n\t     got "
+                          "(%g,%g,%g,%g), error of %g\n",
+                          j, x, x, y, y, z, z, (float)expected[0],
+                          (float)expected[1], (float)expected[2],
+                          (float)expected[3], (float)resultPtr[0],
+                          (float)resultPtr[1], (float)resultPtr[2],
+                          (float)resultPtr[3], error);
+            }
+            else
+            {
+                log_error("Sample %zu: coord {%f(%a),%f(%a),%f(%a)} did not "
+                          "validate!\n\tExpected (%x,%x,%x,%x),\n\t     got "
+                          "(%x,%x,%x,%x)\n",
+                          j, x, x, y, y, z, z, (int)expected[0],
+                          (int)expected[1], (int)expected[2], (int)expected[3],
+                          (int)resultPtr[0], (int)resultPtr[1],
+                          (int)resultPtr[2], (int)resultPtr[3]);
+            }
+            log_error(
+                "Integer coords resolve to %d,%d,%d   with img size %d,%d,%d\n",
+                clampedX, clampedY, clampedZ, (int)imageWidth, (int)imageHeight,
+                (int)imageDepth);
+
+            if (printAsFloat && gExtraValidateInfo)
+            {
+                // Dump the 3x3x3 neighbourhood of the resolved coordinates.
+                log_error("\nNearby values:\n");
+                for (int zOff = -1; zOff <= 1; zOff++)
+                {
+                    for (int yOff = -1; yOff <= 1; yOff++)
+                    {
+                        float top[4], real[4], bot[4];
+                        read_image_pixel_float(imagePtr, imageInfo,
+                                               clampedX - 1, clampedY + yOff,
+                                               clampedZ + zOff, top);
+                        read_image_pixel_float(imagePtr, imageInfo, clampedX,
+                                               clampedY + yOff, clampedZ + zOff,
+                                               real);
+                        read_image_pixel_float(imagePtr, imageInfo,
+                                               clampedX + 1, clampedY + yOff,
+                                               clampedZ + zOff, bot);
+                        log_error("\t(%g,%g,%g,%g)", top[0], top[1], top[2],
+                                  top[3]);
+                        log_error(" (%g,%g,%g,%g)", real[0], real[1], real[2],
+                                  real[3]);
+                        log_error(" (%g,%g,%g,%g)\n", bot[0], bot[1], bot[2],
+                                  bot[3]);
+                    }
+                }
+            }
+            // Only report the pixel search result for non-linear filtering.
+            if (imageSampler->filter_mode != CL_FILTER_LINEAR)
+            {
+                if (found)
+                    log_error(
+                        "\tValue really found in image at %d,%d,%d (%s)\n",
+                        actualX, actualY, actualZ,
+                        (found > 1) ? "NOT unique!!" : "unique");
+                else
+                    log_error("\tValue not actually found in image\n");
+            }
+            log_error("\n");
+        }
+
+        numClamped = -1; // We force the clamped counter to never work
+        if ((--numTries) == 0) return -1;
+    }
+    return 0;
+}
diff --git a/test_conformance/images/kernel_read_write/test_iterations.cpp b/test_conformance/images/kernel_read_write/test_iterations.cpp
index 41cf5d3e80..03ca9595ff 100644
--- a/test_conformance/images/kernel_read_write/test_iterations.cpp
+++ b/test_conformance/images/kernel_read_write/test_iterations.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2017, 2021 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -23,9 +23,6 @@
 #endif
 
 extern bool gTestImage2DFromBuffer;
-extern uint64_t gRoundingStartValue;
-extern cl_mem_flags gMemFlagsToUse;
-extern int gtestTypesToRun;
 
 // Utility function to clamp down image sizes for certain tests to avoid
 // using too much memory.
@@ -75,8 +72,6 @@ static const char *lodOffsetSource =
 static const char *offsetSource =
 "   int offset = tidY*get_image_width(input) + tidX;\n";
 
-extern void read_image_pixel_float( void *imageData, image_descriptor *imageInfo,
-                            int x, int y, int z, float *outData );
 template <class T> int determine_validation_error( void *imagePtr, image_descriptor *imageInfo, image_sampler_data *imageSampler,
                                                 T *resultPtr, T * expected, float error,
                                 float x, float y, float xAddressOffset, float yAddressOffset, size_t j, int &numTries, int &numClamped, bool printAsFloat, int lod = 0 )
diff --git a/test_conformance/images/kernel_read_write/test_read_1D.cpp b/test_conformance/images/kernel_read_write/test_read_1D.cpp
index 606d74fa7e..c9ba4e847b 100644
--- a/test_conformance/images/kernel_read_write/test_read_1D.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_1D.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2017, 2021 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -23,10 +23,6 @@
     #include <setjmp.h>
 #endif
 
-extern uint64_t gRoundingStartValue;
-extern cl_mem_flags gMemFlagsToUse;
-extern int gtestTypesToRun;
-
 const char *read1DKernelSourcePattern =
 "__kernel void sample_kernel( read_only image1d_t input,%s __global float *xOffsets, __global %s4 *results %s)\n"
 "{\n"
@@ -55,8 +51,6 @@ const char *float1DKernelSource =
 
 static const char *samplerKernelArg = " sampler_t imageSampler,";
 
-extern void read_image_pixel_float( void *imageData, image_descriptor *imageInfo,
-                            int x, int y, int z, float *outData );
 template <class T> int determine_validation_error_1D( void *imagePtr, image_descriptor *imageInfo, image_sampler_data *imageSampler,
                                                 T *resultPtr, T * expected, float error,
                                 float x, float xAddressOffset, size_t j, int &numTries, int &numClamped, bool printAsFloat, int lod )
diff --git a/test_conformance/images/kernel_read_write/test_read_1D_array.cpp b/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
index f2b723ad82..b3287ded9c 100644
--- a/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2017, 2021 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,9 +22,6 @@
 #include <setjmp.h>
 #endif
 
-extern uint64_t gRoundingStartValue;
-extern cl_mem_flags gMemFlagsToUse;
-extern int gtestTypesToRun;
 
 const char *read1DArrayKernelSourcePattern =
 "__kernel void sample_kernel( read_only image1d_array_t input,%s __global float *xOffsets, __global float *yOffsets, __global %s4 *results %s)\n"
@@ -62,12 +59,6 @@ const char *floatKernelSource1DArray =
 
 static const char *samplerKernelArg = " sampler_t imageSampler,";
 
-extern void read_image_pixel_float( void *imageData, image_descriptor *imageInfo,
-                                   int x, int y, int z, float *outData );
-
-extern void read_image_pixel_float( void *imageData, image_descriptor *imageInfo,
-                                   int x, int y, int z, float *outData , int lod);
-
 template <class T> int determine_validation_error_1D_arr( void *imagePtr, image_descriptor *imageInfo, image_sampler_data *imageSampler,
                                                   T *resultPtr, T * expected, float error,
                                                   float x, float y, float xAddressOffset, float yAddressOffset, size_t j, int &numTries, int &numClamped, bool printAsFloat, int lod )
diff --git a/test_conformance/images/kernel_read_write/test_read_2D_array.cpp b/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
index 55c03d75a8..7cb334b23f 100644
--- a/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2017, 2021 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -16,9 +16,6 @@
 #include "test_common.h"
 #include <float.h>
 
-extern cl_mem_flags gMemFlagsToUse;
-extern int gtestTypesToRun;
-
 // Utility function to clamp down image sizes for certain tests to avoid
 // using too much memory.
 static size_t reduceImageSizeRange(size_t maxDimSize) {
@@ -78,7 +75,6 @@ const char *float2DArrayUnnormalizedCoordKernelSource =
 
 static const char *samplerKernelArg = " sampler_t imageSampler,";
 
-extern void read_image_pixel_float( void *imageData, image_descriptor *imageInfo, int x, int y, int z, float *outData );
 template <class T> int determine_validation_error_offset_2D_array( void *imagePtr, image_descriptor *imageInfo, image_sampler_data *imageSampler,
                                                          T *resultPtr, T * expected, float error,
                                                          float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset, size_t j, int &numTries, int &numClamped, bool printAsFloat, int lod )
diff --git a/test_conformance/images/kernel_read_write/test_read_3D.cpp b/test_conformance/images/kernel_read_write/test_read_3D.cpp
index 7b598132ed..860114fba4 100644
--- a/test_conformance/images/kernel_read_write/test_read_3D.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_3D.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2017, 2021 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -16,9 +16,6 @@
 #include "test_common.h"
 #include <float.h>
 
-extern cl_mem_flags gMemFlagsToUse;
-extern int gtestTypesToRun;
-
 // Utility function to clamp down image sizes for certain tests to avoid
 // using too much memory.
 static size_t reduceImageSizeRange(size_t maxDimSize, RandomSeed& seed) {
@@ -76,1087 +73,6 @@ const char *float3DUnnormalizedCoordKernelSource =
 
 static const char *samplerKernelArg = " sampler_t imageSampler,";
 
-extern void read_image_pixel_float( void *imageData, image_descriptor *imageInfo, int x, int y, int z, float *outData );
-template <class T> int determine_validation_error_offset( void *imagePtr, image_descriptor *imageInfo, image_sampler_data *imageSampler,
-                                                         T *resultPtr, T * expected, float error,
-                                                         float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset, size_t j, int &numTries, int &numClamped, bool printAsFloat, int lod )
-{
-    int actualX, actualY, actualZ;
-    int found = debug_find_pixel_in_image( imagePtr, imageInfo, resultPtr, &actualX, &actualY, &actualZ, lod );
-    bool clampingErr = false, clamped = false, otherClampingBug = false;
-    int clampedX, clampedY, clampedZ;
-
-    size_t imageWidth = imageInfo->width, imageHeight = imageInfo->height, imageDepth = imageInfo->depth;
-
-    clamped = get_integer_coords_offset( x, y, z, xAddressOffset, yAddressOffset, zAddressOffset, imageWidth, imageHeight, imageDepth, imageSampler, imageInfo, clampedX, clampedY, clampedZ );
-
-    if( found )
-    {
-        // Is it a clamping bug?
-        if( clamped && clampedX == actualX && clampedY == actualY && clampedZ == actualZ )
-        {
-            if( (--numClamped) == 0 )
-            {
-                if( printAsFloat )
-                {
-                    log_error( "Sample %ld: coord {%f(%a),%f(%a),%f(%a)} did not validate! Expected (%g,%g,%g,%g), got (%g,%g,%g,%g), error of %g\n",
-                              j, x, x, y, y, z, z, (float)expected[ 0 ], (float)expected[ 1 ], (float)expected[ 2 ], (float)expected[ 3 ],
-                              (float)resultPtr[ 0 ], (float)resultPtr[ 1 ], (float)resultPtr[ 2 ], (float)resultPtr[ 3 ], error );
-                }
-                else
-                {
-                    log_error( "Sample %ld: coord {%f(%a),%f(%a),%f(%a)} did not validate! Expected (%x,%x,%x,%x), got (%x,%x,%x,%x)\n",
-                              j, x, x, y, y, z, z, (int)expected[ 0 ], (int)expected[ 1 ], (int)expected[ 2 ], (int)expected[ 3 ],
-                              (int)resultPtr[ 0 ], (int)resultPtr[ 1 ], (int)resultPtr[ 2 ], (int)resultPtr[ 3 ] );
-                }
-                log_error( "ERROR: TEST FAILED: Read is erroneously clamping coordinates!\n" );
-                return -1;
-            }
-            clampingErr = true;
-            otherClampingBug = true;
-        }
-    }
-    if( clamped && !otherClampingBug )
-    {
-        // If we are in clamp-to-edge mode and we're getting zeroes, it's possible we're getting border erroneously
-        if( resultPtr[ 0 ] == 0 && resultPtr[ 1 ] == 0 && resultPtr[ 2 ] == 0 && resultPtr[ 3 ] == 0 )
-        {
-            if( (--numClamped) == 0 )
-            {
-                if( printAsFloat )
-                {
-                    log_error( "Sample %ld: coord {%f(%a),%f(%a),%f(%a)} did not validate! Expected (%g,%g,%g,%g), got (%g,%g,%g,%g), error of %g\n",
-                              j, x, x, y, y, z, z, (float)expected[ 0 ], (float)expected[ 1 ], (float)expected[ 2 ], (float)expected[ 3 ],
-                              (float)resultPtr[ 0 ], (float)resultPtr[ 1 ], (float)resultPtr[ 2 ], (float)resultPtr[ 3 ], error );
-                }
-                else
-                {
-                    log_error( "Sample %ld: coord {%f(%a),%f(%a),%f(%a)} did not validate! Expected (%x,%x,%x,%x), got (%x,%x,%x,%x)\n",
-                              j, x, x, y, y, z, z, (int)expected[ 0 ], (int)expected[ 1 ], (int)expected[ 2 ], (int)expected[ 3 ],
-                              (int)resultPtr[ 0 ], (int)resultPtr[ 1 ], (int)resultPtr[ 2 ], (int)resultPtr[ 3 ] );
-                }
-                log_error( "ERROR: TEST FAILED: Clamping is erroneously returning border color!\n" );
-                return -1;
-            }
-            clampingErr = true;
-        }
-    }
-    if( !clampingErr )
-    {
-        /*        if( clamped && ( (int)x + (int)xOffsetValues[ j ] < 0 || (int)y + (int)yOffsetValues[ j ] < 0 ) )
-         {
-         log_error( "NEGATIVE COORDINATE ERROR\n" );
-         return -1;
-         }
-         */
-        if( true ) // gExtraValidateInfo )
-        {
-            if( printAsFloat )
-            {
-                log_error( "Sample %ld: coord {%f(%a),%f(%a),%f(%a)} did not validate!\n\tExpected (%g,%g,%g,%g),\n\t     got (%g,%g,%g,%g), error of %g\n",
-                          j, x, x, y, y, z, z, (float)expected[ 0 ], (float)expected[ 1 ], (float)expected[ 2 ], (float)expected[ 3 ],
-                          (float)resultPtr[ 0 ], (float)resultPtr[ 1 ], (float)resultPtr[ 2 ], (float)resultPtr[ 3 ], error );
-            }
-            else
-            {
-                log_error( "Sample %ld: coord {%f(%a),%f(%a),%f(%a)} did not validate!\n\tExpected (%x,%x,%x,%x),\n\t     got (%x,%x,%x,%x)\n",
-                          j, x, x, y, y, z, z, (int)expected[ 0 ], (int)expected[ 1 ], (int)expected[ 2 ], (int)expected[ 3 ],
-                          (int)resultPtr[ 0 ], (int)resultPtr[ 1 ], (int)resultPtr[ 2 ], (int)resultPtr[ 3 ] );
-            }
-            log_error( "Integer coords resolve to %d,%d,%d   with img size %d,%d,%d\n", clampedX, clampedY, clampedZ, (int)imageWidth, (int)imageHeight, (int)imageDepth );
-
-            if( printAsFloat && gExtraValidateInfo )
-            {
-                log_error( "\nNearby values:\n" );
-                for( int zOff = -1; zOff <= 1; zOff++ )
-                {
-                    for( int yOff = -1; yOff <= 1; yOff++ )
-                    {
-                        float top[ 4 ], real[ 4 ], bot[ 4 ];
-                        read_image_pixel_float( imagePtr, imageInfo, clampedX - 1 , clampedY + yOff, clampedZ + zOff, top );
-                        read_image_pixel_float( imagePtr, imageInfo, clampedX ,clampedY + yOff, clampedZ + zOff, real );
-                        read_image_pixel_float( imagePtr, imageInfo, clampedX + 1, clampedY + yOff, clampedZ + zOff, bot );
-                        log_error( "\t(%g,%g,%g,%g)",top[0], top[1], top[2], top[3] );
-                        log_error( " (%g,%g,%g,%g)", real[0], real[1], real[2], real[3] );
-                        log_error( " (%g,%g,%g,%g)\n",bot[0], bot[1], bot[2], bot[3] );
-                    }
-                }
-            }
-            //        }
-            //        else
-            //            log_error( "\n" );
-            if( imageSampler->filter_mode != CL_FILTER_LINEAR )
-            {
-                if( found )
-                    log_error( "\tValue really found in image at %d,%d,%d (%s)\n", actualX, actualY, actualZ, ( found > 1 ) ? "NOT unique!!" : "unique" );
-                else
-                    log_error( "\tValue not actually found in image\n" );
-            }
-            log_error( "\n" );
-        }
-
-        numClamped = -1; // We force the clamped counter to never work
-        if( ( --numTries ) == 0 )
-            return -1;
-    }
-    return 0;
-}
-
-static void InitFloatCoords( image_descriptor *imageInfo, image_sampler_data *imageSampler, float *xOffsets, float *yOffsets, float *zOffsets, float xfract, float yfract, float zfract, int normalized_coords, MTdata d , int lod)
-{
-    size_t i = 0;
-    if( gDisableOffsets )
-    {
-        for( size_t z = 0; z < imageInfo->depth; z++ )
-        {
-            for( size_t y = 0; y < imageInfo->height; y++ )
-            {
-                for( size_t x = 0; x < imageInfo->width; x++, i++ )
-                {
-                    xOffsets[ i ] = (float) (xfract + (double) x);
-                    yOffsets[ i ] = (float) (yfract + (double) y);
-                    zOffsets[ i ] = (float) (zfract + (double) z);
-                }
-            }
-        }
-    }
-    else
-    {
-        for( size_t z = 0; z < imageInfo->depth; z++ )
-        {
-            for( size_t y = 0; y < imageInfo->height; y++ )
-            {
-                for( size_t x = 0; x < imageInfo->width; x++, i++ )
-                {
-                    xOffsets[ i ] = (float) (xfract + (double) ((int) x + random_in_range( -10, 10, d )));
-                    yOffsets[ i ] = (float) (yfract + (double) ((int) y + random_in_range( -10, 10, d )));
-                    zOffsets[ i ] = (float) (zfract + (double) ((int) z + random_in_range( -10, 10, d )));
-                }
-            }
-        }
-    }
-
-    if( imageSampler->addressing_mode == CL_ADDRESS_NONE )
-    {
-        i = 0;
-        for( size_t z = 0; z < imageInfo->depth; z++ )
-        {
-            for( size_t y = 0; y < imageInfo->height; y++ )
-            {
-                for( size_t x = 0; x < imageInfo->width; x++, i++ )
-                {
-                    xOffsets[ i ] = (float) CLAMP( (double) xOffsets[ i ], 0.0, (double) imageInfo->width - 1.0);
-                    yOffsets[ i ] = (float) CLAMP( (double) yOffsets[ i ], 0.0, (double) imageInfo->height - 1.0);
-                    zOffsets[ i ] = (float) CLAMP( (double) zOffsets[ i ], 0.0, (double) imageInfo->depth - 1.0);
-                }
-            }
-        }
-    }
-
-    if( normalized_coords || gTestMipmaps)
-    {
-        i = 0;
-        if (lod == 0)
-        {
-            for( size_t z = 0; z < imageInfo->depth; z++ )
-            {
-                for( size_t y = 0; y < imageInfo->height; y++ )
-                {
-                    for( size_t x = 0; x < imageInfo->width; x++, i++ )
-                    {
-                        xOffsets[ i ] = (float) ((double) xOffsets[ i ] / (double) imageInfo->width);
-                        yOffsets[ i ] = (float) ((double) yOffsets[ i ] / (double) imageInfo->height);
-                        zOffsets[ i ] = (float) ((double) zOffsets[ i ] / (double) imageInfo->depth);
-                    }
-                }
-            }
-        }
-        else if (gTestMipmaps)
-        {
-            size_t width_lod, height_lod, depth_lod;
-
-            width_lod = (imageInfo->width >> lod)?(imageInfo->width >> lod):1;
-            height_lod = (imageInfo->height >> lod)?(imageInfo->height >> lod):1;
-            depth_lod = (imageInfo->depth >> lod)?(imageInfo->depth >> lod):1;
-
-            for( size_t z = 0; z < depth_lod; z++ )
-            {
-                for( size_t y = 0; y < height_lod; y++ )
-                {
-                    for( size_t x = 0; x < width_lod; x++, i++ )
-                    {
-                        xOffsets[ i ] = (float) ((double) xOffsets[ i ] / (double) width_lod);
-                        yOffsets[ i ] = (float) ((double) yOffsets[ i ] / (double) height_lod);
-                        zOffsets[ i ] = (float) ((double) zOffsets[ i ] / (double) depth_lod);
-                    }
-                }
-            }
-        }
-    }
-}
-
-int test_read_image_3D( cl_context context, cl_command_queue queue, cl_kernel kernel,
-                       image_descriptor *imageInfo, image_sampler_data *imageSampler,
-                       bool useFloatCoords, ExplicitType outputType, MTdata d )
-{
-    int error;
-    size_t threads[3];
-    static int initHalf = 0;
-
-    cl_mem_flags    image_read_write_flags = CL_MEM_READ_ONLY;
-
-    clMemWrapper xOffsets, yOffsets, zOffsets, results;
-    clSamplerWrapper actualSampler;
-    BufferOwningPtr<char> maxImageUseHostPtrBackingStore;
-
-    // Create offset data
-    BufferOwningPtr<cl_float> xOffsetValues(malloc(sizeof(cl_float) *imageInfo->width * imageInfo->height * imageInfo->depth));
-    BufferOwningPtr<cl_float> yOffsetValues(malloc(sizeof(cl_float) *imageInfo->width * imageInfo->height * imageInfo->depth));
-    BufferOwningPtr<cl_float> zOffsetValues(malloc(sizeof(cl_float) *imageInfo->width * imageInfo->height * imageInfo->depth));
-
-    if( imageInfo->format->image_channel_data_type == CL_HALF_FLOAT )
-        if( DetectFloatToHalfRoundingMode(queue) )
-            return 1;
-
-    BufferOwningPtr<char> imageValues;
-    generate_random_image_data( imageInfo, imageValues, d );
-
-    // Construct testing sources
-    clProtectedImage protImage;
-    clMemWrapper unprotImage;
-    cl_mem image;
-
-    if(gtestTypesToRun & kReadTests)
-    {
-        image_read_write_flags = CL_MEM_READ_ONLY;
-    }
-    else
-    {
-        image_read_write_flags = CL_MEM_READ_WRITE;
-    }
-
-    if( gMemFlagsToUse == CL_MEM_USE_HOST_PTR )
-    {
-        // clProtectedImage uses USE_HOST_PTR, so just rely on that for the testing (via Ian)
-        // Do not use protected images for max image size test since it rounds the row size to a page size
-        if (gTestMaxImages) {
-            generate_random_image_data( imageInfo, maxImageUseHostPtrBackingStore, d );
-            unprotImage = create_image_3d(  context,
-                                            image_read_write_flags | CL_MEM_USE_HOST_PTR,
-                                            imageInfo->format,
-                                            imageInfo->width,
-                                            imageInfo->height,
-                                            imageInfo->depth,
-                                            ( gEnablePitch ? imageInfo->rowPitch : 0 ),
-                                            ( gEnablePitch ? imageInfo->slicePitch : 0 ), maxImageUseHostPtrBackingStore, &error );
-        } else {
-            error = protImage.Create(context, image_read_write_flags,
-                                     imageInfo->format, imageInfo->width,
-                                     imageInfo->height, imageInfo->depth);
-        }
-        if( error != CL_SUCCESS )
-        {
-            log_error( "ERROR: Unable to create 3D image of size %d x %d x %d (pitch %d, %d ) (%s)", (int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->depth, (int)imageInfo->rowPitch, (int)imageInfo->slicePitch, IGetErrorString( error ) );
-            return error;
-        }
-        if (gTestMaxImages)
-            image = (cl_mem)unprotImage;
-        else
-            image = (cl_mem)protImage;
-    }
-    else if( gMemFlagsToUse == CL_MEM_COPY_HOST_PTR )
-    {
-        // Don't use clEnqueueWriteImage; just use copy host ptr to get the data in
-        unprotImage = create_image_3d( context,
-                                      image_read_write_flags | CL_MEM_COPY_HOST_PTR,
-                                      imageInfo->format,
-                                      imageInfo->width,
-                                      imageInfo->height,
-                                      imageInfo->depth,
-                                      ( gEnablePitch ? imageInfo->rowPitch : 0 ),
-                                      ( gEnablePitch ? imageInfo->slicePitch : 0 ),
-                                      imageValues, &error );
-        if( error != CL_SUCCESS )
-        {
-            log_error( "ERROR: Unable to create 3D image of size %d x %d x %d (pitch %d, %d ) (%s)", (int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->depth, (int)imageInfo->rowPitch, (int)imageInfo->slicePitch, IGetErrorString( error ) );
-            return error;
-        }
-        image = unprotImage;
-    }
-    else // Either CL_MEM_ALLOC_HOST_PTR or none
-    {
-        // Note: if ALLOC_HOST_PTR is used, the driver allocates memory that can be accessed by the host, but otherwise
-        // it works just as if no flag is specified, so we just do the same thing either way
-        if ( !gTestMipmaps )
-        {
-            unprotImage = create_image_3d( context,
-                                          image_read_write_flags | gMemFlagsToUse,
-                                          imageInfo->format,
-                                          imageInfo->width, imageInfo->height, imageInfo->depth,
-                                          ( gEnablePitch ? imageInfo->rowPitch : 0 ),
-                                          ( gEnablePitch ? imageInfo->slicePitch : 0 ),
-                                          imageValues, &error );
-            if( error != CL_SUCCESS )
-            {
-                log_error( "ERROR: Unable to create 3D image of size %d x %d x %d (pitch %d, %d ) (%s)", (int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->depth, (int)imageInfo->rowPitch, (int)imageInfo->slicePitch, IGetErrorString( error ) );
-                return error;
-            }
-            image = unprotImage;
-        }
-        else
-        {
-            cl_image_desc image_desc = {0};
-            image_desc.image_type = CL_MEM_OBJECT_IMAGE3D;
-            image_desc.image_width = imageInfo->width;
-            image_desc.image_height = imageInfo->height;
-            image_desc.image_depth = imageInfo->depth;
-            image_desc.num_mip_levels = imageInfo->num_mip_levels;
-
-
-            unprotImage = clCreateImage( context,
-                                        image_read_write_flags,
-                                        imageInfo->format, &image_desc, NULL, &error);
-            if( error != CL_SUCCESS )
-            {
-                log_error( "ERROR: Unable to create %d level mipmapped 3D image of size %d x %d x %d (pitch %d, %d ) (%s)",(int)imageInfo->num_mip_levels, (int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->depth, (int)imageInfo->rowPitch, (int)imageInfo->slicePitch, IGetErrorString( error ) );
-                return error;
-            }
-            image = unprotImage;
-        }
-    }
-
-    if( gMemFlagsToUse != CL_MEM_COPY_HOST_PTR )
-    {
-        size_t origin[ 4 ] = { 0, 0, 0, 0};
-        size_t region[ 3 ] = { imageInfo->width, imageInfo->height, imageInfo->depth };
-
-        if( gDebugTrace )
-            log_info( " - Writing image...\n" );
-
-        if ( !gTestMipmaps )
-        {
-
-            error = clEnqueueWriteImage(queue, image, CL_TRUE,
-                                        origin, region, gEnablePitch ? imageInfo->rowPitch : 0, gEnablePitch ? imageInfo->slicePitch : 0,
-                                        imageValues , 0, NULL, NULL);
-
-            if (error != CL_SUCCESS)
-            {
-                log_error( "ERROR: Unable to write to 3D image of size %d x %d x %d \n", (int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->depth );
-                return error;
-            }
-        }
-        else
-        {
-            int nextLevelOffset = 0;
-
-            for (int i =0; i < imageInfo->num_mip_levels; i++)
-            {   origin[3] = i;
-                error = clEnqueueWriteImage(queue, image, CL_TRUE,
-                                            origin, region, /*gEnablePitch ? imageInfo->rowPitch :*/ 0, /*gEnablePitch ? imageInfo->slicePitch :*/ 0,
-                                            ((char*)imageValues + nextLevelOffset), 0, NULL, NULL);
-                if (error != CL_SUCCESS)
-                {
-                    log_error( "ERROR: Unable to write to %d level mipmapped 3D image of size %d x %d x %d\n", (int)imageInfo->num_mip_levels,(int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->depth );
-                    return error;
-                }
-                nextLevelOffset += region[0]*region[1]*region[2]*get_pixel_size(imageInfo->format);
-                //Subsequent mip level dimensions keep halving
-                region[0] = region[0] >> 1 ? region[0] >> 1 : 1;
-                region[1] = region[1] >> 1 ? region[1] >> 1 : 1;
-                region[2] = region[2] >> 1 ? region[2] >> 1 : 1;
-            }
-        }
-    }
-
-    xOffsets = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
-                              sizeof(cl_float) * imageInfo->width
-                                  * imageInfo->height * imageInfo->depth,
-                              xOffsetValues, &error);
-    test_error( error, "Unable to create x offset buffer" );
-    yOffsets = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
-                              sizeof(cl_float) * imageInfo->width
-                                  * imageInfo->height * imageInfo->depth,
-                              yOffsetValues, &error);
-    test_error( error, "Unable to create y offset buffer" );
-    zOffsets = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
-                              sizeof(cl_float) * imageInfo->width
-                                  * imageInfo->height * imageInfo->depth,
-                              zOffsetValues, &error);
-    test_error( error, "Unable to create y offset buffer" );
-    results =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       get_explicit_type_size(outputType) * 4 * imageInfo->width
-                           * imageInfo->height * imageInfo->depth,
-                       NULL, &error);
-    test_error( error, "Unable to create result buffer" );
-
-    // Create sampler to use
-    actualSampler = create_sampler(context, imageSampler, gTestMipmaps, &error);
-    test_error(error, "Unable to create image sampler");
-
-    // Set arguments
-    int idx = 0;
-    error = clSetKernelArg( kernel, idx++, sizeof( cl_mem ), &image );
-    test_error( error, "Unable to set kernel arguments" );
-    if( !gUseKernelSamplers )
-    {
-        error = clSetKernelArg( kernel, idx++, sizeof( cl_sampler ), &actualSampler );
-        test_error( error, "Unable to set kernel arguments" );
-    }
-    error = clSetKernelArg( kernel, idx++, sizeof( cl_mem ), &xOffsets );
-    test_error( error, "Unable to set kernel arguments" );
-    error = clSetKernelArg( kernel, idx++, sizeof( cl_mem ), &yOffsets );
-    test_error( error, "Unable to set kernel arguments" );
-    error = clSetKernelArg( kernel, idx++, sizeof( cl_mem ), &zOffsets );
-    test_error( error, "Unable to set kernel arguments" );
-    error = clSetKernelArg( kernel, idx++, sizeof( cl_mem ), &results );
-    test_error( error, "Unable to set kernel arguments" );
-
-    const float float_offsets[] = { 0.0f, MAKE_HEX_FLOAT(0x1.0p-30f, 0x1L, -30), 0.25f, 0.3f, 0.5f - FLT_EPSILON/4.0f, 0.5f, 0.9f, 1.0f - FLT_EPSILON/2 };
-    int float_offset_count = sizeof( float_offsets) / sizeof( float_offsets[0] );
-    int numTries = MAX_TRIES, numClamped = MAX_CLAMPED;
-    int loopCount = 2 * float_offset_count;
-    if( ! useFloatCoords )
-        loopCount = 1;
-    if (gTestMaxImages) {
-        loopCount = 1;
-        log_info("Testing each size only once with pixel offsets of %g for max sized images.\n", float_offsets[0]);
-    }
-
-    // Get the maximum absolute error for this format
-    double formatAbsoluteError = get_max_absolute_error(imageInfo->format, imageSampler);
-    if (gDebugTrace) log_info("\tformatAbsoluteError is %e\n", formatAbsoluteError);
-
-    if (0 == initHalf && imageInfo->format->image_channel_data_type == CL_HALF_FLOAT ) {
-        initHalf = CL_SUCCESS == DetectFloatToHalfRoundingMode( queue );
-        if (initHalf) {
-            log_info("Half rounding mode successfully detected.\n");
-        }
-    }
-
-    int nextLevelOffset = 0;
-    size_t width_lod = imageInfo->width, height_lod = imageInfo->height, depth_lod = imageInfo->depth;
-
-    //Loop over all mipmap levels, if we are testing mipmapped images.
-    for(int lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels) || (!gTestMipmaps && lod < 1); lod++)
-    {
-        size_t resultValuesSize = width_lod * height_lod * depth_lod * get_explicit_type_size( outputType ) * 4;
-        BufferOwningPtr<char> resultValues(malloc( resultValuesSize ));
-        float lod_float = (float)lod;
-        if (gTestMipmaps) {
-            //Set the lod kernel arg
-            if(gDebugTrace)
-                log_info(" - Working at mip level %d\n", lod);
-            error = clSetKernelArg( kernel, idx, sizeof( float ), &lod_float);
-            test_error( error, "Unable to set kernel arguments" );
-        }
-
-    for( int q = 0; q < loopCount; q++ )
-    {
-        float offset = float_offsets[ q % float_offset_count ];
-
-        // Init the coordinates
-        InitFloatCoords( imageInfo, imageSampler, xOffsetValues, yOffsetValues, zOffsetValues,
-                        q>=float_offset_count ? -offset: offset,
-                        q>=float_offset_count ? offset: -offset,
-                        q>=float_offset_count ? -offset: offset,
-                        imageSampler->normalized_coords, d, lod );
-
-        error = clEnqueueWriteBuffer( queue, xOffsets, CL_TRUE, 0, sizeof(cl_float) * imageInfo->height * imageInfo->width * imageInfo->depth, xOffsetValues, 0, NULL, NULL );
-        test_error( error, "Unable to write x offsets" );
-        error = clEnqueueWriteBuffer( queue, yOffsets, CL_TRUE, 0, sizeof(cl_float) * imageInfo->height * imageInfo->width * imageInfo->depth, yOffsetValues, 0, NULL, NULL );
-        test_error( error, "Unable to write y offsets" );
-        error = clEnqueueWriteBuffer( queue, zOffsets, CL_TRUE, 0, sizeof(cl_float) * imageInfo->height * imageInfo->width * imageInfo->depth, zOffsetValues, 0, NULL, NULL );
-        test_error( error, "Unable to write z offsets" );
-
-
-        memset( resultValues, 0xff, resultValuesSize );
-        clEnqueueWriteBuffer( queue, results, CL_TRUE, 0, resultValuesSize, resultValues, 0, NULL, NULL );
-
-        // Figure out thread dimensions
-        threads[0] = (size_t)width_lod;
-        threads[1] = (size_t)height_lod;
-        threads[2] = (size_t)depth_lod;
-
-        // Run the kernel
-        error = clEnqueueNDRangeKernel( queue, kernel, 3, NULL, threads, NULL, 0, NULL, NULL );
-        test_error( error, "Unable to run kernel" );
-
-        // Get results
-        error = clEnqueueReadBuffer( queue, results, CL_TRUE, 0, width_lod * height_lod * depth_lod * get_explicit_type_size( outputType ) * 4, resultValues, 0, NULL, NULL );
-        test_error( error, "Unable to read results from kernel" );
-        if( gDebugTrace )
-            log_info( "    results read\n" );
-
-        // Validate results element by element
-        char *imagePtr = (char*)imageValues + nextLevelOffset;
-        /*
-         * FLOAT output type
-         */
-        if(is_sRGBA_order(imageInfo->format->image_channel_order) && (outputType == kFloat) )
-        {
-            // Validate float results
-            float *resultPtr = (float *)(char *)resultValues;
-            float expected[4], error=0.0f;
-            float maxErr = get_max_relative_error( imageInfo->format, imageSampler, 1 /*3D*/, CL_FILTER_LINEAR == imageSampler->filter_mode );
-
-            for( size_t z = 0, j = 0; z < depth_lod; z++ )
-            {
-                for( size_t y = 0; y < height_lod; y++ )
-                {
-                    for( size_t x = 0; x < width_lod; x++, j++ )
-                    {
-                        // Step 1: go through and see if the results verify for the pixel
-                        // For the normalized case on a GPU we put in offsets to the X, Y and Z to see if we land on the
-                        // right pixel. This addresses the significant inaccuracy in GPU normalization in OpenCL 1.0.
-                        int checkOnlyOnePixel = 0;
-                        int found_pixel = 0;
-                        float offset = NORM_OFFSET;
-                        if (!imageSampler->normalized_coords ||  imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0
-#if defined( __APPLE__ )
-                            // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes
-                            || gDeviceType != CL_DEVICE_TYPE_GPU
-#endif
-                            )
-                            offset = 0.0f;          // Loop only once
-
-                        for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel ; norm_offset_x += NORM_OFFSET) {
-                            for (float norm_offset_y = -offset; norm_offset_y <= offset && !found_pixel ; norm_offset_y += NORM_OFFSET) {
-                                for (float norm_offset_z = -offset; norm_offset_z <= NORM_OFFSET && !found_pixel; norm_offset_z += NORM_OFFSET) {
-
-                                    int hasDenormals = 0;
-                                    FloatPixel maxPixel = sample_image_pixel_float_offset( imagePtr, imageInfo,
-                                                                                          xOffsetValues[ j ], yOffsetValues[ j ], zOffsetValues[ j ],
-                                                                                          norm_offset_x, norm_offset_y, norm_offset_z,
-                                                                                          imageSampler, expected, 0, &hasDenormals, lod );
-
-                                    float err1 =
-                                        ABS_ERROR(sRGBmap(resultPtr[0]),
-                                                  sRGBmap(expected[0]));
-                                    float err2 =
-                                        ABS_ERROR(sRGBmap(resultPtr[1]),
-                                                  sRGBmap(expected[1]));
-                                    float err3 =
-                                        ABS_ERROR(sRGBmap(resultPtr[2]),
-                                                  sRGBmap(expected[2]));
-                                    float err4 =
-                                        ABS_ERROR(resultPtr[3], expected[3]);
-                                    // Clamp to the minimum absolute error for the format
-                                    if (err1 > 0 && err1 < formatAbsoluteError) { err1 = 0.0f; }
-                                    if (err2 > 0 && err2 < formatAbsoluteError) { err2 = 0.0f; }
-                                    if (err3 > 0 && err3 < formatAbsoluteError) { err3 = 0.0f; }
-                                    if (err4 > 0 && err4 < formatAbsoluteError) { err4 = 0.0f; }
-                                    float maxErr = 0.5;
-
-                                    if( ! (err1 <= maxErr) || ! (err2 <= maxErr)    || ! (err3 <= maxErr) || ! (err4 <= maxErr) )
-                                    {
-                                        // Try flushing the denormals
-                                        if( hasDenormals )
-                                        {
-                                            // If implementation decide to flush subnormals to zero,
-                                            // max error needs to be adjusted
-                                              maxErr += 4 * FLT_MIN;
-
-                                            maxPixel = sample_image_pixel_float_offset( imagePtr, imageInfo,
-                                                                                       xOffsetValues[ j ], yOffsetValues[ j ], zOffsetValues[ j ],
-                                                                                       norm_offset_x, norm_offset_y, norm_offset_z,
-                                                                                       imageSampler, expected, 0, NULL, lod );
-
-                                            err1 =
-                                                ABS_ERROR(sRGBmap(resultPtr[0]),
-                                                          sRGBmap(expected[0]));
-                                            err2 =
-                                                ABS_ERROR(sRGBmap(resultPtr[1]),
-                                                          sRGBmap(expected[1]));
-                                            err3 =
-                                                ABS_ERROR(sRGBmap(resultPtr[2]),
-                                                          sRGBmap(expected[2]));
-                                            err4 = ABS_ERROR(resultPtr[3],
-                                                             expected[3]);
-                                        }
-                                    }
-
-                                    found_pixel = (err1 <= maxErr) && (err2 <= maxErr)  && (err3 <= maxErr) && (err4 <= maxErr);
-                                }//norm_offset_z
-                            }//norm_offset_y
-                        }//norm_offset_x
-
-                        // Step 2: If we did not find a match, then print out debugging info.
-                        if (!found_pixel) {
-                            // For the normalized case on a GPU we put in offsets to the X and Y to see if we land on the
-                            // right pixel. This addresses the significant inaccuracy in GPU normalization in OpenCL 1.0.
-                            checkOnlyOnePixel = 0;
-                            int shouldReturn = 0;
-                            for (float norm_offset_x = -offset; norm_offset_x <= offset && !checkOnlyOnePixel; norm_offset_x += NORM_OFFSET) {
-                                for (float norm_offset_y = -offset; norm_offset_y <= offset && !checkOnlyOnePixel; norm_offset_y += NORM_OFFSET) {
-                                    for (float norm_offset_z = -offset; norm_offset_z <= offset && !checkOnlyOnePixel; norm_offset_z += NORM_OFFSET) {
-
-                                        int hasDenormals = 0;
-                                        FloatPixel maxPixel = sample_image_pixel_float_offset( imagePtr, imageInfo,
-                                                                                              xOffsetValues[ j ], yOffsetValues[ j ], zOffsetValues[ j ],
-                                                                                              norm_offset_x, norm_offset_y, norm_offset_z,
-                                                                                              imageSampler, expected, 0, &hasDenormals, lod );
-
-                                        float err1 =
-                                            ABS_ERROR(sRGBmap(resultPtr[0]),
-                                                      sRGBmap(expected[0]));
-                                        float err2 =
-                                            ABS_ERROR(sRGBmap(resultPtr[1]),
-                                                      sRGBmap(expected[1]));
-                                        float err3 =
-                                            ABS_ERROR(sRGBmap(resultPtr[2]),
-                                                      sRGBmap(expected[2]));
-                                        float err4 = ABS_ERROR(resultPtr[3],
-                                                               expected[3]);
-                                        float maxErr = 0.6;
-
-                                        if( ! (err1 <= maxErr) || ! (err2 <= maxErr)    || ! (err3 <= maxErr) || ! (err4 <= maxErr) )
-                                        {
-                                            // Try flushing the denormals
-                                            if( hasDenormals )
-                                            {
-                                                // If implementation decide to flush subnormals to zero,
-                                                // max error needs to be adjusted
-                                                  maxErr += 4 * FLT_MIN;
-
-                                                maxPixel = sample_image_pixel_float( imagePtr, imageInfo,
-                                                                                    xOffsetValues[ j ], yOffsetValues[ j ], zOffsetValues[ j ],
-                                                                                    imageSampler, expected, 0, NULL, lod );
-
-                                                err1 = ABS_ERROR(
-                                                    sRGBmap(resultPtr[0]),
-                                                    sRGBmap(expected[0]));
-                                                err2 = ABS_ERROR(
-                                                    sRGBmap(resultPtr[1]),
-                                                    sRGBmap(expected[1]));
-                                                err3 = ABS_ERROR(
-                                                    sRGBmap(resultPtr[2]),
-                                                    sRGBmap(expected[2]));
-                                                err4 = ABS_ERROR(resultPtr[3],
-                                                                 expected[3]);
-                                            }
-                                        }
-
-                                        if( ! (err1 <= maxErr) || ! (err2 <= maxErr)    || ! (err3 <= maxErr) || ! (err4 <= maxErr) )
-                                        {
-                                            log_error("FAILED norm_offsets: %g , %g , %g:\n", norm_offset_x, norm_offset_y, norm_offset_z);
-
-                                            float tempOut[4];
-                                            shouldReturn |= determine_validation_error_offset<float>( imagePtr, imageInfo, imageSampler, resultPtr,
-                                                                                                     expected, error, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j],
-                                                                                                     norm_offset_x, norm_offset_y, norm_offset_z, j,
-                                                                                                     numTries, numClamped, true, lod );
-                                            log_error( "Step by step:\n" );
-                                            FloatPixel temp = sample_image_pixel_float_offset( imagePtr, imageInfo,
-                                                                                              xOffsetValues[ j ], yOffsetValues[ j ], zOffsetValues[ j ],
-                                                                                              norm_offset_x, norm_offset_y, norm_offset_z,
-                                                                                              imageSampler, tempOut, 1 /*verbose*/, &hasDenormals, lod);
-                                            log_error( "\tulps: %2.2f, %2.2f, %2.2f, %2.2f  (max allowed: %2.2f)\n\n",
-                                                      Ulp_Error( resultPtr[0], expected[0] ),
-                                                      Ulp_Error( resultPtr[1], expected[1] ),
-                                                      Ulp_Error( resultPtr[2], expected[2] ),
-                                                      Ulp_Error( resultPtr[3], expected[3] ),
-                                                      Ulp_Error( MAKE_HEX_FLOAT(0x1.000002p0f, 0x1000002L, -24) + maxErr, MAKE_HEX_FLOAT(0x1.000002p0f, 0x1000002L, -24) ) );
-                                        } else {
-                                            log_error("Test error: we should have detected this passing above.\n");
-                                        }
-                                    }//norm_offset_z
-                                }//norm_offset_y
-                            }//norm_offset_x
-                            if( shouldReturn )
-                                return 1;
-                        } // if (!found_pixel)
-
-                        resultPtr += 4;
-                    }
-                }
-            }
-        }
-        /*
-         * FLOAT output type
-         */
-        else if( outputType == kFloat )
-        {
-            // Validate float results
-            float *resultPtr = (float *)(char *)resultValues;
-            float expected[4], error=0.0f;
-            float maxErr = get_max_relative_error( imageInfo->format, imageSampler, 1 /*3D*/, CL_FILTER_LINEAR == imageSampler->filter_mode );
-
-            for( size_t z = 0, j = 0; z < depth_lod; z++ )
-            {
-                for( size_t y = 0; y < height_lod; y++ )
-                {
-                    for( size_t x = 0; x < width_lod; x++, j++ )
-                    {
-                        // Step 1: go through and see if the results verify for the pixel
-                        // For the normalized case on a GPU we put in offsets to the X, Y and Z to see if we land on the
-                        // right pixel. This addresses the significant inaccuracy in GPU normalization in OpenCL 1.0.
-                        int checkOnlyOnePixel = 0;
-                        int found_pixel = 0;
-                        float offset = NORM_OFFSET;
-                        if (!imageSampler->normalized_coords ||  imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0
-#if defined( __APPLE__ )
-                            // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes
-                            || gDeviceType != CL_DEVICE_TYPE_GPU
-#endif
-                            )
-                            offset = 0.0f;          // Loop only once
-
-                        for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel ; norm_offset_x += NORM_OFFSET) {
-                            for (float norm_offset_y = -offset; norm_offset_y <= offset && !found_pixel ; norm_offset_y += NORM_OFFSET) {
-                                for (float norm_offset_z = -offset; norm_offset_z <= NORM_OFFSET && !found_pixel; norm_offset_z += NORM_OFFSET) {
-
-                                    int hasDenormals = 0;
-                                    FloatPixel maxPixel = sample_image_pixel_float_offset( imagePtr, imageInfo,
-                                                                                          xOffsetValues[ j ], yOffsetValues[ j ], zOffsetValues[ j ],
-                                                                                          norm_offset_x, norm_offset_y, norm_offset_z,
-                                                                                          imageSampler, expected, 0, &hasDenormals, lod );
-
-                                    float err1 =
-                                        ABS_ERROR(resultPtr[0], expected[0]);
-                                    float err2 =
-                                        ABS_ERROR(resultPtr[1], expected[1]);
-                                    float err3 =
-                                        ABS_ERROR(resultPtr[2], expected[2]);
-                                    float err4 =
-                                        ABS_ERROR(resultPtr[3], expected[3]);
-                                    // Clamp to the minimum absolute error for the format
-                                    if (err1 > 0 && err1 < formatAbsoluteError) { err1 = 0.0f; }
-                                    if (err2 > 0 && err2 < formatAbsoluteError) { err2 = 0.0f; }
-                                    if (err3 > 0 && err3 < formatAbsoluteError) { err3 = 0.0f; }
-                                    if (err4 > 0 && err4 < formatAbsoluteError) { err4 = 0.0f; }
-                                    float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                                    float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                                    float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                                    float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
-
-                                    if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    || ! (err3 <= maxErr3) || ! (err4 <= maxErr4) )
-                                    {
-                                        // Try flushing the denormals
-                                        if( hasDenormals )
-                                        {
-                                            // If implementation decide to flush subnormals to zero,
-                                            // max error needs to be adjusted
-                                              maxErr1 += 4 * FLT_MIN;
-                                            maxErr2 += 4 * FLT_MIN;
-                                            maxErr3 += 4 * FLT_MIN;
-                                            maxErr4 += 4 * FLT_MIN;
-
-                                            maxPixel = sample_image_pixel_float_offset( imagePtr, imageInfo,
-                                                                                       xOffsetValues[ j ], yOffsetValues[ j ], zOffsetValues[ j ],
-                                                                                       norm_offset_x, norm_offset_y, norm_offset_z,
-                                                                                       imageSampler, expected, 0, NULL, lod );
-
-                                            err1 = ABS_ERROR(resultPtr[0],
-                                                             expected[0]);
-                                            err2 = ABS_ERROR(resultPtr[1],
-                                                             expected[1]);
-                                            err3 = ABS_ERROR(resultPtr[2],
-                                                             expected[2]);
-                                            err4 = ABS_ERROR(resultPtr[3],
-                                                             expected[3]);
-                                        }
-                                    }
-
-                                    found_pixel = (err1 <= maxErr1) && (err2 <= maxErr2)  && (err3 <= maxErr3) && (err4 <= maxErr4);
-                                }//norm_offset_z
-                            }//norm_offset_y
-                        }//norm_offset_x
-
-                        // Step 2: If we did not find a match, then print out debugging info.
-                        if (!found_pixel) {
-                            // For the normalized case on a GPU we put in offsets to the X and Y to see if we land on the
-                            // right pixel. This addresses the significant inaccuracy in GPU normalization in OpenCL 1.0.
-                            checkOnlyOnePixel = 0;
-                            int shouldReturn = 0;
-                            for (float norm_offset_x = -offset; norm_offset_x <= offset && !checkOnlyOnePixel; norm_offset_x += NORM_OFFSET) {
-                                for (float norm_offset_y = -offset; norm_offset_y <= offset && !checkOnlyOnePixel; norm_offset_y += NORM_OFFSET) {
-                                    for (float norm_offset_z = -offset; norm_offset_z <= offset && !checkOnlyOnePixel; norm_offset_z += NORM_OFFSET) {
-
-                                        int hasDenormals = 0;
-                                        FloatPixel maxPixel = sample_image_pixel_float_offset( imagePtr, imageInfo,
-                                                                                              xOffsetValues[ j ], yOffsetValues[ j ], zOffsetValues[ j ],
-                                                                                              norm_offset_x, norm_offset_y, norm_offset_z,
-                                                                                              imageSampler, expected, 0, &hasDenormals, lod );
-
-                                        float err1 = ABS_ERROR(resultPtr[0],
-                                                               expected[0]);
-                                        float err2 = ABS_ERROR(resultPtr[1],
-                                                               expected[1]);
-                                        float err3 = ABS_ERROR(resultPtr[2],
-                                                               expected[2]);
-                                        float err4 = ABS_ERROR(resultPtr[3],
-                                                               expected[3]);
-                                        float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                                        float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                                        float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                                        float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
-
-
-                                        if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    || ! (err3 <= maxErr3) || ! (err4 <= maxErr4) )
-                                        {
-                                            // Try flushing the denormals
-                                            if( hasDenormals )
-                                            {
-                                                  maxErr1 += 4 * FLT_MIN;
-                                                maxErr2 += 4 * FLT_MIN;
-                                                maxErr3 += 4 * FLT_MIN;
-                                                maxErr4 += 4 * FLT_MIN;
-
-                                                maxPixel = sample_image_pixel_float( imagePtr, imageInfo,
-                                                                                    xOffsetValues[ j ], yOffsetValues[ j ], zOffsetValues[ j ],
-                                                                                    imageSampler, expected, 0, NULL, lod );
-
-                                                err1 = ABS_ERROR(resultPtr[0],
-                                                                 expected[0]);
-                                                err2 = ABS_ERROR(resultPtr[1],
-                                                                 expected[1]);
-                                                err3 = ABS_ERROR(resultPtr[2],
-                                                                 expected[2]);
-                                                err4 = ABS_ERROR(resultPtr[3],
-                                                                 expected[3]);
-                                            }
-                                        }
-
-                                        if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    || ! (err3 <= maxErr3) || ! (err4 <= maxErr4) )
-                                        {
-                                            log_error("FAILED norm_offsets: %g , %g , %g:\n", norm_offset_x, norm_offset_y, norm_offset_z);
-
-                                            float tempOut[4];
-                                            shouldReturn |= determine_validation_error_offset<float>( imagePtr, imageInfo, imageSampler, resultPtr,
-                                                                                                     expected, error, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j],
-                                                                                                     norm_offset_x, norm_offset_y, norm_offset_z, j,
-                                                                                                     numTries, numClamped, true, lod );
-                                            log_error( "Step by step:\n" );
-                                            FloatPixel temp = sample_image_pixel_float_offset( imagePtr, imageInfo,
-                                                                                              xOffsetValues[ j ], yOffsetValues[ j ], zOffsetValues[ j ],
-                                                                                              norm_offset_x, norm_offset_y, norm_offset_z,
-                                                                                              imageSampler, tempOut, 1 /*verbose*/, &hasDenormals, lod);
-                                            log_error( "\tulps: %2.2f, %2.2f, %2.2f, %2.2f  (max allowed: %2.2f)\n\n",
-                                                      Ulp_Error( resultPtr[0], expected[0] ),
-                                                      Ulp_Error( resultPtr[1], expected[1] ),
-                                                      Ulp_Error( resultPtr[2], expected[2] ),
-                                                      Ulp_Error( resultPtr[3], expected[3] ),
-                                                      Ulp_Error( MAKE_HEX_FLOAT(0x1.000002p0f, 0x1000002L, -24) + maxErr, MAKE_HEX_FLOAT(0x1.000002p0f, 0x1000002L, -24) ) );
-                                        } else {
-                                            log_error("Test error: we should have detected this passing above.\n");
-                                        }
-                                    }//norm_offset_z
-                                }//norm_offset_y
-                            }//norm_offset_x
-                            if( shouldReturn )
-                                return 1;
-                        } // if (!found_pixel)
-
-                        resultPtr += 4;
-                    }
-                }
-            }
-        }
-        /*
-         * UINT output type
-         */
-        else if( outputType == kUInt )
-        {
-            // Validate unsigned integer results
-            unsigned int *resultPtr = (unsigned int *)(char *)resultValues;
-            unsigned int expected[4];
-            float error;
-            for( size_t z = 0, j = 0; z < depth_lod; z++ )
-            {
-                for( size_t y = 0; y < height_lod; y++ )
-                {
-                    for( size_t x = 0; x < width_lod; x++, j++ )
-                    {
-                        // Step 1: go through and see if the results verify for the pixel
-                        // For the normalized case on a GPU we put in offsets to the X, Y and Z to see if we land on the
-                        // right pixel. This addresses the significant inaccuracy in GPU normalization in OpenCL 1.0.
-                        int checkOnlyOnePixel = 0;
-                        int found_pixel = 0;
-                        for (float norm_offset_x = -NORM_OFFSET; norm_offset_x <= NORM_OFFSET && !found_pixel && !checkOnlyOnePixel; norm_offset_x += NORM_OFFSET) {
-                            for (float norm_offset_y = -NORM_OFFSET; norm_offset_y <= NORM_OFFSET && !found_pixel && !checkOnlyOnePixel; norm_offset_y += NORM_OFFSET) {
-                                for (float norm_offset_z = -NORM_OFFSET; norm_offset_z <= NORM_OFFSET && !found_pixel && !checkOnlyOnePixel; norm_offset_z += NORM_OFFSET) {
-
-                                    // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
-                                    // E.g., test one pixel.
-                                    if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
-                                        norm_offset_x = 0.0f;
-                                        norm_offset_y = 0.0f;
-                                        norm_offset_z = 0.0f;
-                                        checkOnlyOnePixel = 1;
-                                    }
-
-                                    sample_image_pixel_offset<unsigned int>( imagePtr, imageInfo,
-                                                                            xOffsetValues[ j ], yOffsetValues[ j ], zOffsetValues[ j ],
-                                                                            norm_offset_x, norm_offset_y, norm_offset_z,
-                                                                            imageSampler, expected, lod );
-
-                                    error = errMax( errMax( abs_diff_uint(expected[ 0 ], resultPtr[ 0 ]), abs_diff_uint(expected[ 1 ], resultPtr[ 1 ]) ),
-                                                   errMax( abs_diff_uint(expected[ 2 ], resultPtr[ 2 ]), abs_diff_uint(expected[ 3 ], resultPtr[ 3 ]) ) );
-
-                                    if (error < MAX_ERR)
-                                        found_pixel = 1;
-                                }//norm_offset_z
-                            }//norm_offset_y
-                        }//norm_offset_x
-
-                        // Step 2: If we did not find a match, then print out debugging info.
-                        if (!found_pixel) {
-                            // For the normalized case on a GPU we put in offsets to the X and Y to see if we land on the
-                            // right pixel. This addresses the significant inaccuracy in GPU normalization in OpenCL 1.0.
-                            checkOnlyOnePixel = 0;
-                            int shouldReturn = 0;
-                            for (float norm_offset_x = -NORM_OFFSET; norm_offset_x <= NORM_OFFSET && !checkOnlyOnePixel; norm_offset_x += NORM_OFFSET) {
-                                for (float norm_offset_y = -NORM_OFFSET; norm_offset_y <= NORM_OFFSET && !checkOnlyOnePixel; norm_offset_y += NORM_OFFSET) {
-                                    for (float norm_offset_z = -NORM_OFFSET; norm_offset_z <= NORM_OFFSET && !checkOnlyOnePixel; norm_offset_z += NORM_OFFSET) {
-
-                                        // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
-                                        // E.g., test one pixel.
-                                        if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
-                                            norm_offset_x = 0.0f;
-                                            norm_offset_y = 0.0f;
-                                            norm_offset_z = 0.0f;
-                                            checkOnlyOnePixel = 1;
-                                        }
-
-                                        sample_image_pixel_offset<unsigned int>( imagePtr, imageInfo,
-                                                                                xOffsetValues[ j ], yOffsetValues[ j ], zOffsetValues[ j ],
-                                                                                norm_offset_x, norm_offset_y, norm_offset_z,
-                                                                                imageSampler, expected, lod );
-
-                                        error = errMax( errMax( abs_diff_uint(expected[ 0 ], resultPtr[ 0 ]), abs_diff_uint(expected[ 1 ], resultPtr[ 1 ]) ),
-                                                       errMax( abs_diff_uint(expected[ 2 ], resultPtr[ 2 ]), abs_diff_uint(expected[ 3 ], resultPtr[ 3 ]) ) );
-
-                                        if( error > MAX_ERR )
-                                        {
-                                            log_error("FAILED norm_offsets: %g , %g , %g:\n", norm_offset_x, norm_offset_y, norm_offset_z);
-                                            shouldReturn |=  determine_validation_error_offset<unsigned int>( imagePtr, imageInfo, imageSampler, resultPtr,
-                                                                                                             expected, error, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j],
-                                                                                                             norm_offset_x, norm_offset_y, norm_offset_z,
-                                                                                                             j, numTries, numClamped, false, lod );
-                                        } else {
-                                            log_error("Test error: we should have detected this passing above.\n");
-                                        }
-                                    }//norm_offset_z
-                                }//norm_offset_y
-                            }//norm_offset_x
-                            if( shouldReturn )
-                                return 1;
-                        } // if (!found_pixel)
-
-                        resultPtr += 4;
-                    }
-                }
-            }
-        }
-        else
-        /*
-         * INT output type
-         */
-        {
-            // Validate integer results
-            int *resultPtr = (int *)(char *)resultValues;
-            int expected[4];
-            float error;
-            for( size_t z = 0, j = 0; z < depth_lod; z++ )
-            {
-                for( size_t y = 0; y < height_lod; y++ )
-                {
-                    for( size_t x = 0; x < width_lod; x++, j++ )
-                    {
-                        // Step 1: go through and see if the results verify for the pixel
-                        // For the normalized case on a GPU we put in offsets to the X, Y and Z to see if we land on the
-                        // right pixel. This addresses the significant inaccuracy in GPU normalization in OpenCL 1.0.
-                        int checkOnlyOnePixel = 0;
-                        int found_pixel = 0;
-                        for (float norm_offset_x = -NORM_OFFSET; norm_offset_x <= NORM_OFFSET && !found_pixel && !checkOnlyOnePixel; norm_offset_x += NORM_OFFSET) {
-                            for (float norm_offset_y = -NORM_OFFSET; norm_offset_y <= NORM_OFFSET && !found_pixel && !checkOnlyOnePixel; norm_offset_y += NORM_OFFSET) {
-                                for (float norm_offset_z = -NORM_OFFSET; norm_offset_z <= NORM_OFFSET && !found_pixel && !checkOnlyOnePixel; norm_offset_z += NORM_OFFSET) {
-
-                                    // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
-                                    // E.g., test one pixel.
-                                    if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) {
-                                        norm_offset_x = 0.0f;
-                                        norm_offset_y = 0.0f;
-                                        norm_offset_z = 0.0f;
-                                        checkOnlyOnePixel = 1;
-                                    }
-
-                                    sample_image_pixel_offset<int>( imagePtr, imageInfo,
-                                                                   xOffsetValues[ j ], yOffsetValues[ j ], zOffsetValues[ j ],
-                                                                   norm_offset_x, norm_offset_y, norm_offset_z,
-                                                                   imageSampler, expected, lod );
-
-                                    error = errMax( errMax( abs_diff_int(expected[ 0 ], resultPtr[ 0 ]), abs_diff_int(expected[ 1 ], resultPtr[ 1 ]) ),
-                                                   errMax( abs_diff_int(expected[ 2 ], resultPtr[ 2 ]), abs_diff_int(expected[ 3 ], resultPtr[ 3 ]) ) );
-
-                                    if (error < MAX_ERR)
-                                        found_pixel = 1;
-                                }//norm_offset_z
-                            }//norm_offset_y
-                        }//norm_offset_x
-
-                        // Step 2: If we did not find a match, then print out debugging info.
-                        if (!found_pixel) {
-                            // For the normalized case on a GPU we put in offsets to the X and Y to see if we land on the
-                            // right pixel. This addresses the significant inaccuracy in GPU normalization in OpenCL 1.0.
-                            checkOnlyOnePixel = 0;
-                            int shouldReturn = 0;
-                            for (float norm_offset_x = -NORM_OFFSET; norm_offset_x <= NORM_OFFSET && !checkOnlyOnePixel; norm_offset_x += NORM_OFFSET) {
-                                for (float norm_offset_y = -NORM_OFFSET; norm_offset_y <= NORM_OFFSET && !checkOnlyOnePixel; norm_offset_y += NORM_OFFSET) {
-                                    for (float norm_offset_z = -NORM_OFFSET; norm_offset_z <= NORM_OFFSET && !checkOnlyOnePixel; norm_offset_z += NORM_OFFSET) {
-
-                                        // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0)
-                                        // E.g., test one pixel.
-                                        if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0 || NORM_OFFSET == 0 || NORM_OFFSET == 0) {
-                                            norm_offset_x = 0.0f;
-                                            norm_offset_y = 0.0f;
-                                            norm_offset_z = 0.0f;
-                                            checkOnlyOnePixel = 1;
-                                        }
-
-                                        sample_image_pixel_offset<int>( imagePtr, imageInfo,
-                                                                       xOffsetValues[ j ], yOffsetValues[ j ], zOffsetValues[ j ],
-                                                                       norm_offset_x, norm_offset_y, norm_offset_z,
-                                                                       imageSampler, expected, lod );
-
-                                        error = errMax( errMax( abs_diff_int(expected[ 0 ], resultPtr[ 0 ]), abs_diff_int(expected[ 1 ], resultPtr[ 1 ]) ),
-                                                       errMax( abs_diff_int(expected[ 2 ], resultPtr[ 2 ]), abs_diff_int(expected[ 3 ], resultPtr[ 3 ]) ) );
-
-                                        if( error > MAX_ERR )
-                                        {
-                                            log_error("FAILED norm_offsets: %g , %g , %g:\n", norm_offset_x, norm_offset_y, norm_offset_z);
-                                            shouldReturn |=  determine_validation_error_offset<int>( imagePtr, imageInfo, imageSampler, resultPtr,
-                                                                                                    expected, error, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j],
-                                                                                                    norm_offset_x, norm_offset_y, norm_offset_z,
-                                                                                                    j, numTries, numClamped, false, lod );
-                                        } else {
-                                            log_error("Test error: we should have detected this passing above.\n");
-                                        }
-                                    }//norm_offset_z
-                                }//norm_offset_y
-                            }//norm_offset_x
-                            if( shouldReturn )
-                                return 1;
-                        } // if (!found_pixel)
-
-                        resultPtr += 4;
-                        }
-                    }
-                }
-            }
-        }
-        {
-            nextLevelOffset += width_lod * height_lod * depth_lod * get_pixel_size(imageInfo->format);
-            width_lod = ( width_lod >> 1) ?( width_lod >> 1) : 1;
-            height_lod = ( height_lod >> 1) ?( height_lod >> 1) : 1;
-            depth_lod = ( depth_lod >> 1) ?( depth_lod >> 1) : 1;
-        }
-    }
-
-    return numTries != MAX_TRIES || numClamped != MAX_CLAMPED;
-}
 
 int test_read_image_set_3D(cl_device_id device, cl_context context,
                            cl_command_queue queue,
@@ -1253,7 +169,9 @@ int test_read_image_set_3D(cl_device_id device, cl_context context,
 
                     if( gDebugTrace )
                         log_info( "   at size %d,%d,%d\n", (int)imageInfo.width, (int)imageInfo.height, (int)imageInfo.depth );
-                    int retCode = test_read_image_3D( context, queue, kernel, &imageInfo, imageSampler, floatCoords, outputType, seed );
+                    int retCode = test_read_image(
+                        context, queue, kernel, &imageInfo, imageSampler,
+                        floatCoords, outputType, seed);
                     if( retCode )
                         return retCode;
                 }
@@ -1280,7 +198,9 @@ int test_read_image_set_3D(cl_device_id device, cl_context context,
             log_info("Testing %d x %d x %d\n", (int)sizes[ idx ][ 0 ], (int)sizes[ idx ][ 1 ], (int)sizes[ idx ][ 2 ]);
             if( gDebugTrace )
                 log_info( "   at max size %d,%d,%d\n", (int)sizes[ idx ][ 0 ], (int)sizes[ idx ][ 1 ], (int)sizes[ idx ][ 2 ] );
-            int retCode = test_read_image_3D( context, queue, kernel, &imageInfo, imageSampler, floatCoords, outputType, seed );
+            int retCode =
+                test_read_image(context, queue, kernel, &imageInfo,
+                                imageSampler, floatCoords, outputType, seed);
             if( retCode )
                 return retCode;
         }
@@ -1294,7 +214,9 @@ int test_read_image_set_3D(cl_device_id device, cl_context context,
 
         imageInfo.rowPitch = imageInfo.width * get_pixel_size( imageInfo.format );
         imageInfo.slicePitch = imageInfo.height * imageInfo.rowPitch;
-        int retCode = test_read_image_3D( context, queue, kernel, &imageInfo, imageSampler, floatCoords, outputType, seed );
+        int retCode =
+            test_read_image(context, queue, kernel, &imageInfo, imageSampler,
+                            floatCoords, outputType, seed);
         if( retCode )
             return retCode;
     }
@@ -1344,7 +266,9 @@ int test_read_image_set_3D(cl_device_id device, cl_context context,
                 if ( gTestMipmaps )
                     log_info( "   and number of mip levels :%d\n", (int)imageInfo.num_mip_levels );
             }
-            int retCode = test_read_image_3D( context, queue, kernel, &imageInfo, imageSampler, floatCoords, outputType, seed );
+            int retCode =
+                test_read_image(context, queue, kernel, &imageInfo,
+                                imageSampler, floatCoords, outputType, seed);
             if( retCode )
                 return retCode;
         }

From 216455842dc7763a2a18d32ba94c18e681c250c5 Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Tue, 2 Feb 2021 17:43:37 +0100
Subject: [PATCH 033/158] Change arg type to unsigned int from signed int
 (#1078)

* Change arg type to unsigned int from signed int: pass the operand of
  atomic_fetch_add_explicit on atomic_uint objects as `1u` instead of `1`

* Fix formatting issues
---
 .../test_fine_grain_memory_consistency.cpp    |  50 +-
 .../SVM/test_fine_grain_sync_buffers.cpp      |  24 +-
 .../device_execution/enqueue_ndrange.cpp      | 539 ++++++++++--------
 3 files changed, 334 insertions(+), 279 deletions(-)

diff --git a/test_conformance/SVM/test_fine_grain_memory_consistency.cpp b/test_conformance/SVM/test_fine_grain_memory_consistency.cpp
index 42ea0bd2ad..b28db41190 100644
--- a/test_conformance/SVM/test_fine_grain_memory_consistency.cpp
+++ b/test_conformance/SVM/test_fine_grain_memory_consistency.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -16,27 +16,33 @@
 #include "common.h"
 
 static char hash_table_kernel[] =
-  "#if 0\n"
-  "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
-  "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
-  "#endif\n"
-  "typedef struct BinNode {\n"
-  " int value;\n"
-  " atomic_uintptr_t pNext;\n"
-  "} BinNode;\n"
-
-  "__kernel void build_hash_table(__global uint* input, __global BinNode* pNodes, volatile __global atomic_uint* pNumNodes, uint numBins)\n"
-  "{\n"
-  " __global BinNode *pNew = &pNodes[ atomic_fetch_add_explicit(pNumNodes, 1, memory_order_relaxed, memory_scope_all_svm_devices) ];\n"
-  " uint i = get_global_id(0);\n"
-  " uint b = input[i] % numBins;\n"
-  " pNew->value = input[i];\n"
-  " uintptr_t next = atomic_load_explicit(&(pNodes[b].pNext), memory_order_seq_cst, memory_scope_all_svm_devices);\n"
-  " do\n"
-  " {\n"
-  "   atomic_store_explicit(&(pNew->pNext), next, memory_order_seq_cst, memory_scope_all_svm_devices);\n" // always inserting at head of list
-  " } while(!atomic_compare_exchange_strong_explicit(&(pNodes[b].pNext), &next, (uintptr_t)pNew, memory_order_seq_cst, memory_order_relaxed, memory_scope_all_svm_devices));\n"
-  "}\n";
+    "#if 0\n"
+    "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
+    "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
+    "#endif\n"
+    "typedef struct BinNode {\n"
+    " int value;\n"
+    " atomic_uintptr_t pNext;\n"
+    "} BinNode;\n"
+
+    "__kernel void build_hash_table(__global uint* input, __global BinNode* "
+    "pNodes, volatile __global atomic_uint* pNumNodes, uint numBins)\n"
+    "{\n"
+    " __global BinNode *pNew = &pNodes[ atomic_fetch_add_explicit(pNumNodes, "
+    "1u, memory_order_relaxed, memory_scope_all_svm_devices) ];\n"
+    " uint i = get_global_id(0);\n"
+    " uint b = input[i] % numBins;\n"
+    " pNew->value = input[i];\n"
+    " uintptr_t next = atomic_load_explicit(&(pNodes[b].pNext), "
+    "memory_order_seq_cst, memory_scope_all_svm_devices);\n"
+    " do\n"
+    " {\n"
+    "   atomic_store_explicit(&(pNew->pNext), next, memory_order_seq_cst, "
+    "memory_scope_all_svm_devices);\n" // always inserting at head of list
+    " } while(!atomic_compare_exchange_strong_explicit(&(pNodes[b].pNext), "
+    "&next, (uintptr_t)pNew, memory_order_seq_cst, memory_order_relaxed, "
+    "memory_scope_all_svm_devices));\n"
+    "}\n";
 
 typedef struct BinNode{
   cl_uint value;
diff --git a/test_conformance/SVM/test_fine_grain_sync_buffers.cpp b/test_conformance/SVM/test_fine_grain_sync_buffers.cpp
index 4cc34952b3..0b94cbf2f5 100644
--- a/test_conformance/SVM/test_fine_grain_sync_buffers.cpp
+++ b/test_conformance/SVM/test_fine_grain_sync_buffers.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -17,15 +17,19 @@
 
 const char *find_targets_kernel[] = {
 
-  "__kernel void find_targets(__global uint* image, uint target, volatile __global atomic_uint *numTargetsFound, volatile __global atomic_uint *targetLocations)\n"
-  "{\n"
-  " size_t i = get_global_id(0);\n"
-  " uint index;\n"
-  " if(image[i] == target) {\n"
-  "   index = atomic_fetch_add_explicit(numTargetsFound, 1, memory_order_relaxed, memory_scope_device); \n"
-  "   atomic_exchange_explicit(&targetLocations[index], i, memory_order_relaxed, memory_scope_all_svm_devices); \n"
-  " }\n"
-  "}\n"
+    "__kernel void find_targets(__global uint* image, uint target, volatile "
+    "__global atomic_uint *numTargetsFound, volatile __global atomic_uint "
+    "*targetLocations)\n"
+    "{\n"
+    " size_t i = get_global_id(0);\n"
+    " uint index;\n"
+    " if(image[i] == target) {\n"
+    "   index = atomic_fetch_add_explicit(numTargetsFound, 1u, "
+    "memory_order_relaxed, memory_scope_device); \n"
+    "   atomic_exchange_explicit(&targetLocations[index], i, "
+    "memory_order_relaxed, memory_scope_all_svm_devices); \n"
+    " }\n"
+    "}\n"
 };
 
 
diff --git a/test_conformance/device_execution/enqueue_ndrange.cpp b/test_conformance/device_execution/enqueue_ndrange.cpp
index 84ac339f58..8ced6629d8 100644
--- a/test_conformance/device_execution/enqueue_ndrange.cpp
+++ b/test_conformance/device_execution/enqueue_ndrange.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -27,271 +27,316 @@
 
 #ifdef CL_VERSION_2_0
 extern int gWimpyMode;
-static const char* helper_ndrange_1d_glo[] =
-{
-    NL, "void block_fn(int len, __global atomic_uint* val)"
-    NL, "{"
-    NL, "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
-    NL, "}"
-    NL, ""
-    NL, "kernel void helper_ndrange_1d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val,  __global uint* ofs_arr)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
-    NL, ""
-    NL, "  for(int i = 0; i < n; i++)"
-    NL, "  {"
-    NL, "    ndrange_t ndrange = ndrange_1D(glob_size_arr[i]);"
-    NL, "    int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "  }"
-    NL, "}"
-    NL
+static const char *helper_ndrange_1d_glo[] = {
+    NL,
+    "void block_fn(int len, __global atomic_uint* val)" NL,
+    "{" NL,
+    "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
+    "memory_order_relaxed, memory_scope_device);" NL,
+    "}" NL,
+    "" NL,
+    "kernel void helper_ndrange_1d_glo(__global int* res, uint n, uint len, "
+    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
+    "atomic_uint* val,  __global uint* ofs_arr)" NL,
+    "{" NL,
+    "  size_t tid = get_global_id(0);" NL,
+    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
+    "" NL,
+    "  for(int i = 0; i < n; i++)" NL,
+    "  {" NL,
+    "    ndrange_t ndrange = ndrange_1D(glob_size_arr[i]);" NL,
+    "    int enq_res = enqueue_kernel(get_default_queue(), "
+    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
+    "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
+    "  }" NL,
+    "}" NL
 };
 
-static const char* helper_ndrange_1d_loc[] =
-{
-    NL, "void block_fn(int len, __global atomic_uint* val)"
-    NL, "{"
-    NL, "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
-    NL, "}"
-    NL, ""
-    NL, "kernel void helper_ndrange_1d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val,  __global uint* ofs_arr)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
-    NL, ""
-    NL, "  for(int k = 0; k < n; k++)"
-    NL, "  {"
-    NL, "    for(int i = 0; i < n; i++)"
-    NL, "    {"
-    NL, "      if (glob_size_arr[i] >= loc_size_arr[k])"
-    NL, "      {"
-    NL, "        ndrange_t ndrange = ndrange_1D(glob_size_arr[i], loc_size_arr[k]);"
-    NL, "        int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "      }"
-    NL, "    }"
-    NL, "  }"
-    NL, "}"
-    NL
+static const char *helper_ndrange_1d_loc[] = {
+    NL,
+    "void block_fn(int len, __global atomic_uint* val)" NL,
+    "{" NL,
+    "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
+    "memory_order_relaxed, memory_scope_device);" NL,
+    "}" NL,
+    "" NL,
+    "kernel void helper_ndrange_1d_loc(__global int* res, uint n, uint len, "
+    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
+    "atomic_uint* val,  __global uint* ofs_arr)" NL,
+    "{" NL,
+    "  size_t tid = get_global_id(0);" NL,
+    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
+    "" NL,
+    "  for(int k = 0; k < n; k++)" NL,
+    "  {" NL,
+    "    for(int i = 0; i < n; i++)" NL,
+    "    {" NL,
+    "      if (glob_size_arr[i] >= loc_size_arr[k])" NL,
+    "      {" NL,
+    "        ndrange_t ndrange = ndrange_1D(glob_size_arr[i], "
+    "loc_size_arr[k]);" NL,
+    "        int enq_res = enqueue_kernel(get_default_queue(), "
+    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
+    "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
+    "      }" NL,
+    "    }" NL,
+    "  }" NL,
+    "}" NL
 };
 
-static const char* helper_ndrange_1d_ofs[] =
-{
-    NL, "void block_fn(int len, __global atomic_uint* val)"
-    NL, "{"
-    NL, "  atomic_fetch_add_explicit(&val[(get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);"
-    NL, "}"
-    NL, ""
-    NL, "kernel void helper_ndrange_1d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val,  __global uint* ofs_arr)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
-    NL, ""
-    NL, "  for(int l = 0; l < n; l++)"
-    NL, "  {"
-    NL, "    for(int k = 0; k < n; k++)"
-    NL, "    {"
-    NL, "      for(int i = 0; i < n; i++)"
-    NL, "      {"
-    NL, "        if (glob_size_arr[i] >= loc_size_arr[k])"
-    NL, "        {"
-    NL, "          ndrange_t ndrange = ndrange_1D(ofs_arr[l], glob_size_arr[i], loc_size_arr[k]);"
-    NL, "          int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "        }"
-    NL, "      }"
-    NL, "    }"
-    NL, "  }"
-    NL, "}"
-    NL
+static const char *helper_ndrange_1d_ofs[] = {
+    NL,
+    "void block_fn(int len, __global atomic_uint* val)" NL,
+    "{" NL,
+    "  atomic_fetch_add_explicit(&val[(get_global_offset(0) + "
+    "get_global_linear_id()) % len], 1u, memory_order_relaxed, "
+    "memory_scope_device);" NL,
+    "}" NL,
+    "" NL,
+    "kernel void helper_ndrange_1d_ofs(__global int* res, uint n, uint len, "
+    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
+    "atomic_uint* val,  __global uint* ofs_arr)" NL,
+    "{" NL,
+    "  size_t tid = get_global_id(0);" NL,
+    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
+    "" NL,
+    "  for(int l = 0; l < n; l++)" NL,
+    "  {" NL,
+    "    for(int k = 0; k < n; k++)" NL,
+    "    {" NL,
+    "      for(int i = 0; i < n; i++)" NL,
+    "      {" NL,
+    "        if (glob_size_arr[i] >= loc_size_arr[k])" NL,
+    "        {" NL,
+    "          ndrange_t ndrange = ndrange_1D(ofs_arr[l], glob_size_arr[i], "
+    "loc_size_arr[k]);" NL,
+    "          int enq_res = enqueue_kernel(get_default_queue(), "
+    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
+    "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
+    "        }" NL,
+    "      }" NL,
+    "    }" NL,
+    "  }" NL,
+    "}" NL
 };
 
-static const char* helper_ndrange_2d_glo[] =
-{
-    NL, "void block_fn(int len, __global atomic_uint* val)"
-    NL, "{"
-    NL, "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
-    NL, "}"
-    NL, ""
-    NL, "kernel void helper_ndrange_2d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val,  __global uint* ofs_arr)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
-    NL, ""
-    NL, "  for(int i = 0; i < n; i++)"
-    NL, "  {"
-    NL, "    size_t glob_size[2] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] };"
-    NL, "    ndrange_t ndrange = ndrange_2D(glob_size);"
-    NL, "    int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "  }"
-    NL, "}"
-    NL
+static const char *helper_ndrange_2d_glo[] = {
+    NL,
+    "void block_fn(int len, __global atomic_uint* val)" NL,
+    "{" NL,
+    "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
+    "memory_order_relaxed, memory_scope_device);" NL,
+    "}" NL,
+    "" NL,
+    "kernel void helper_ndrange_2d_glo(__global int* res, uint n, uint len, "
+    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
+    "val,  __global uint* ofs_arr)" NL,
+    "{" NL,
+    "  size_t tid = get_global_id(0);" NL,
+    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
+    "" NL,
+    "  for(int i = 0; i < n; i++)" NL,
+    "  {" NL,
+    "    size_t glob_size[2] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] "
+    "};" NL,
+    "    ndrange_t ndrange = ndrange_2D(glob_size);" NL,
+    "    int enq_res = enqueue_kernel(get_default_queue(), "
+    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
+    "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
+    "  }" NL,
+    "}" NL
 };
 
-static const char* helper_ndrange_2d_loc[] =
-{
-    NL, "void block_fn(int len, __global atomic_uint* val)"
-    NL, "{"
-    NL, "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
-    NL, "}"
-    NL, ""
-    NL, "kernel void helper_ndrange_2d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val,  __global uint* ofs_arr)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
-    NL, ""
-    NL, "  for(int k = 0; k < n; k++)"
-    NL, "  {"
-    NL, "    for(int i = 0; i < n; i++)"
-    NL, "    {"
-    NL, "      if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])"
-    NL, "      {"
-    NL, "        size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] };"
-    NL, "        size_t loc_size[] = { 1, loc_size_arr[k] };"
-    NL, ""
-    NL, "        ndrange_t ndrange = ndrange_2D(glob_size, loc_size);"
-    NL, "        int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "      }"
-    NL, "    }"
-    NL, "  }"
-    NL, "}"
-    NL
+static const char *helper_ndrange_2d_loc[] = {
+    NL,
+    "void block_fn(int len, __global atomic_uint* val)" NL,
+    "{" NL,
+    "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
+    "memory_order_relaxed, memory_scope_device);" NL,
+    "}" NL,
+    "" NL,
+    "kernel void helper_ndrange_2d_loc(__global int* res, uint n, uint len, "
+    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
+    "val,  __global uint* ofs_arr)" NL,
+    "{" NL,
+    "  size_t tid = get_global_id(0);" NL,
+    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
+    "" NL,
+    "  for(int k = 0; k < n; k++)" NL,
+    "  {" NL,
+    "    for(int i = 0; i < n; i++)" NL,
+    "    {" NL,
+    "      if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" NL,
+    "      {" NL,
+    "        size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
+    "n] };" NL,
+    "        size_t loc_size[] = { 1, loc_size_arr[k] };" NL,
+    "" NL,
+    "        ndrange_t ndrange = ndrange_2D(glob_size, loc_size);" NL,
+    "        int enq_res = enqueue_kernel(get_default_queue(), "
+    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
+    "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
+    "      }" NL,
+    "    }" NL,
+    "  }" NL,
+    "}" NL
 };
 
 
-static const char* helper_ndrange_2d_ofs[] =
-{
-    NL, "void block_fn(int len, __global atomic_uint* val)"
-    NL, "{"
-    NL, "  atomic_fetch_add_explicit(&val[(get_global_offset(1) * get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);"
-    NL, "}"
-    NL, ""
-    NL, "kernel void helper_ndrange_2d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val,  __global uint* ofs_arr)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
-    NL, ""
-    NL, "  for(int l = 0; l < n; l++)"
-    NL, "  {"
-    NL, "    for(int k = 0; k < n; k++)"
-    NL, "    {"
-    NL, "      for(int i = 0; i < n; i++)"
-    NL, "      {"
-    NL, "        if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])"
-    NL, "        {"
-    NL, "          size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n]};"
-    NL, "          size_t loc_size[] = { 1, loc_size_arr[k] };"
-    NL, "          size_t ofs[] = { ofs_arr[l], ofs_arr[(l + 1) % n] };"
-    NL, ""
-    NL, "          ndrange_t ndrange = ndrange_2D(ofs,glob_size,loc_size);"
-    NL, "          int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "        }"
-    NL, "      }"
-    NL, "    }"
-    NL, "  }"
-    NL, "}"
-    NL
+static const char *helper_ndrange_2d_ofs[] = {
+    NL,
+    "void block_fn(int len, __global atomic_uint* val)" NL,
+    "{" NL,
+    "  atomic_fetch_add_explicit(&val[(get_global_offset(1) * "
+    "get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % "
+    "len], 1u, memory_order_relaxed, memory_scope_device);" NL,
+    "}" NL,
+    "" NL,
+    "kernel void helper_ndrange_2d_ofs(__global int* res, uint n, uint len, "
+    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
+    "val,  __global uint* ofs_arr)" NL,
+    "{" NL,
+    "  size_t tid = get_global_id(0);" NL,
+    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
+    "" NL,
+    "  for(int l = 0; l < n; l++)" NL,
+    "  {" NL,
+    "    for(int k = 0; k < n; k++)" NL,
+    "    {" NL,
+    "      for(int i = 0; i < n; i++)" NL,
+    "      {" NL,
+    "        if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" NL,
+    "        {" NL,
+    "          size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) "
+    "% n]};" NL,
+    "          size_t loc_size[] = { 1, loc_size_arr[k] };" NL,
+    "          size_t ofs[] = { ofs_arr[l], ofs_arr[(l + 1) % n] };" NL,
+    "" NL,
+    "          ndrange_t ndrange = ndrange_2D(ofs,glob_size,loc_size);" NL,
+    "          int enq_res = enqueue_kernel(get_default_queue(), "
+    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
+    "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
+    "        }" NL,
+    "      }" NL,
+    "    }" NL,
+    "  }" NL,
+    "}" NL
 };
 
 
-static const char* helper_ndrange_3d_glo[] =
-{
-    NL, "void block_fn(int len, __global atomic_uint* val)"
-    NL, "{"
-    NL, "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
-    NL, "}"
-    NL, ""
-    NL, "kernel void helper_ndrange_3d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val,  __global uint* ofs_arr)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
-    NL, ""
-    NL, "  for(int i = 0; i < n; i++)"
-    NL, "  {"
-    NL, "    uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];"
-    NL, "    if (global_work_size <= (len * len))"
-    NL, "    {"
-    NL, "      size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n] };"
-    NL, "      ndrange_t ndrange = ndrange_3D(glob_size);"
-    NL, "      int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "      if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "    }"
-    NL, "  }"
-    NL, "}"
-    NL
+static const char *helper_ndrange_3d_glo[] = {
+    NL,
+    "void block_fn(int len, __global atomic_uint* val)" NL,
+    "{" NL,
+    "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
+    "memory_order_relaxed, memory_scope_device);" NL,
+    "}" NL,
+    "" NL,
+    "kernel void helper_ndrange_3d_glo(__global int* res, uint n, uint len, "
+    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
+    "val,  __global uint* ofs_arr)" NL,
+    "{" NL,
+    "  size_t tid = get_global_id(0);" NL,
+    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
+    "" NL,
+    "  for(int i = 0; i < n; i++)" NL,
+    "  {" NL,
+    "    uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) % "
+    "n] * glob_size_arr[(i + 2) % n];" NL,
+    "    if (global_work_size <= (len * len))" NL,
+    "    {" NL,
+    "      size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
+    "n], glob_size_arr[(i + 2) % n] };" NL,
+    "      ndrange_t ndrange = ndrange_3D(glob_size);" NL,
+    "      int enq_res = enqueue_kernel(get_default_queue(), "
+    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
+    "      if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
+    "    }" NL,
+    "  }" NL,
+    "}" NL
 };
 
 
-static const char* helper_ndrange_3d_loc[] =
-{
-    NL, "void block_fn(int len, __global atomic_uint* val)"
-    NL, "{"
-    NL, "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
-    NL, "}"
-    NL, ""
-    NL, "kernel void helper_ndrange_3d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val,  __global uint* ofs_arr)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
-    NL, ""
-    NL, "  for(int k = 0; k < n; k++)"
-    NL, "  {"
-    NL, "    for(int i = 0; i < n; i++)"
-    NL, "    {"
-    NL, "      uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];"
-    NL, "      if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && global_work_size <= (len * len))"
-    NL, "      {"
-    NL, "        size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n] };"
-    NL, "        size_t loc_size[] = { 1, 1, loc_size_arr[k] };"
-    NL, "        ndrange_t ndrange = ndrange_3D(glob_size,loc_size);"
-    NL, "        int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "      "
-    NL, "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "      }"
-    NL, "    }"
-    NL, "  }"
-    NL, "}"
-    NL
+static const char *helper_ndrange_3d_loc[] = {
+    NL,
+    "void block_fn(int len, __global atomic_uint* val)" NL,
+    "{" NL,
+    "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
+    "memory_order_relaxed, memory_scope_device);" NL,
+    "}" NL,
+    "" NL,
+    "kernel void helper_ndrange_3d_loc(__global int* res, uint n, uint len, "
+    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
+    "val,  __global uint* ofs_arr)" NL,
+    "{" NL,
+    "  size_t tid = get_global_id(0);" NL,
+    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
+    "" NL,
+    "  for(int k = 0; k < n; k++)" NL,
+    "  {" NL,
+    "    for(int i = 0; i < n; i++)" NL,
+    "    {" NL,
+    "      uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) % "
+    "n] * glob_size_arr[(i + 2) % n];" NL,
+    "      if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && "
+    "global_work_size <= (len * len))" NL,
+    "      {" NL,
+    "        size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
+    "n], glob_size_arr[(i + 2) % n] };" NL,
+    "        size_t loc_size[] = { 1, 1, loc_size_arr[k] };" NL,
+    "        ndrange_t ndrange = ndrange_3D(glob_size,loc_size);" NL,
+    "        int enq_res = enqueue_kernel(get_default_queue(), "
+    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
+    "      " NL,
+    "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
+    "      }" NL,
+    "    }" NL,
+    "  }" NL,
+    "}" NL
 };
 
-static const char* helper_ndrange_3d_ofs[] =
-{
-    NL, "void block_fn(int len, __global atomic_uint* val)"
-    NL, "{"
-    NL, "  atomic_fetch_add_explicit(&val[(get_global_offset(2) * get_global_size(0) * get_global_size(1) + get_global_offset(1) * get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);"
-    NL, "}"
-    NL, ""
-    NL, "kernel void helper_ndrange_3d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val,  __global uint* ofs_arr)"
-    NL, "{"
-    NL, "  size_t tid = get_global_id(0);"
-    NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
-    NL, ""
-    NL, "  for(int l = 0; l < n; l++)"
-    NL, "  {"
-    NL, "    for(int k = 0; k < n; k++)"
-    NL, "    {"
-    NL, "      for(int i = 0; i < n; i++)"
-    NL, "      {"
-    NL, "        uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];"
-    NL, "        if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && global_work_size <= (len * len))"
-    NL, "        {"
-    NL, "          size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n]};"
-    NL, "          size_t loc_size[3] = { 1, 1, loc_size_arr[k] };"
-    NL, "          size_t ofs[3] = { ofs_arr[l], ofs_arr[(l + 1) % n], ofs_arr[(l + 2) % n] };"
-    NL, "          ndrange_t ndrange = ndrange_3D(ofs,glob_size,loc_size);"
-    NL, "          int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
-    NL, "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
-    NL, "        }"
-    NL, "      }"
-    NL, "    }"
-    NL, "  }"
-    NL, "}"
-    NL
+static const char *helper_ndrange_3d_ofs[] = {
+    NL,
+    "void block_fn(int len, __global atomic_uint* val)" NL,
+    "{" NL,
+    "  atomic_fetch_add_explicit(&val[(get_global_offset(2) * "
+    "get_global_size(0) * get_global_size(1) + get_global_offset(1) * "
+    "get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % "
+    "len], 1u, memory_order_relaxed, memory_scope_device);" NL,
+    "}" NL,
+    "" NL,
+    "kernel void helper_ndrange_3d_ofs(__global int* res, uint n, uint len, "
+    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
+    "val,  __global uint* ofs_arr)" NL,
+    "{" NL,
+    "  size_t tid = get_global_id(0);" NL,
+    "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
+    "" NL,
+    "  for(int l = 0; l < n; l++)" NL,
+    "  {" NL,
+    "    for(int k = 0; k < n; k++)" NL,
+    "    {" NL,
+    "      for(int i = 0; i < n; i++)" NL,
+    "      {" NL,
+    "        uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) "
+    "% n] * glob_size_arr[(i + 2) % n];" NL,
+    "        if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && "
+    "global_work_size <= (len * len))" NL,
+    "        {" NL,
+    "          size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) "
+    "% n], glob_size_arr[(i + 2) % n]};" NL,
+    "          size_t loc_size[3] = { 1, 1, loc_size_arr[k] };" NL,
+    "          size_t ofs[3] = { ofs_arr[l], ofs_arr[(l + 1) % n], ofs_arr[(l "
+    "+ 2) % n] };" NL,
+    "          ndrange_t ndrange = ndrange_3D(ofs,glob_size,loc_size);" NL,
+    "          int enq_res = enqueue_kernel(get_default_queue(), "
+    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
+    "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
+    "        }" NL,
+    "      }" NL,
+    "    }" NL,
+    "  }" NL,
+    "}" NL
 };
 
 static const kernel_src_dim_check sources_ndrange_Xd[] =

From fad6a005c99932346e60b5add36fb83ac583597d Mon Sep 17 00:00:00 2001
From: Stephen Clarke <stephen.clarke@imgtec.com>
Date: Wed, 3 Feb 2021 14:34:18 +0000
Subject: [PATCH 034/158] Fix test_vector_swizzle possible overwrite of padding
 of 3-element vectors (#1124)

Use vstore3 to ensure padding is not overwritten in destination buffer.

Fixes #1120
---
 .../basic/test_vector_swizzle.cpp             | 48 +++++++++++++------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/test_conformance/basic/test_vector_swizzle.cpp b/test_conformance/basic/test_vector_swizzle.cpp
index 67bf753754..5ab3ea4fd2 100644
--- a/test_conformance/basic/test_vector_swizzle.cpp
+++ b/test_conformance/basic/test_vector_swizzle.cpp
@@ -94,11 +94,17 @@ __kernel void test_vector_swizzle_xyzw(TYPE value, __global TYPE* dst) {
     int index = 0;
 
     // lvalue swizzles
-    dst[index++].x = value.x;
-    dst[index++].y = value.x;
-    dst[index++].z = value.x;
-    dst[index++].xyz = value;
-    dst[index++].zyx = value;
+    TYPE t;
+    t = dst[index]; t.x = value.x;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
+    t = dst[index]; t.y = value.x;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
+    t = dst[index]; t.z = value.x;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
+    t = dst[index]; t.xyz = value;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
+    t = dst[index]; t.zyx = value;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
 
     // rvalue swizzles
     vstore3(value.x, 0, (__global BASETYPE*)(dst + index++));
@@ -114,11 +120,17 @@ __kernel void test_vector_swizzle_rgba(TYPE value, __global TYPE* dst) {
     int index = 0;
 
     // lvalue swizzles
-    dst[index++].r = value.r;
-    dst[index++].g = value.r;
-    dst[index++].b = value.r;
-    dst[index++].rgb = value;
-    dst[index++].bgr = value;
+    TYPE t;
+    t = dst[index]; t.r = value.r;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
+    t = dst[index]; t.g = value.r;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
+    t = dst[index]; t.b = value.r;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
+    t = dst[index]; t.rgb = value;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
+    t = dst[index]; t.bgr = value;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
 
     // rvalue swizzles
     vstore3(value.r, 0, (__global BASETYPE*)(dst + index++));
@@ -134,11 +146,17 @@ __kernel void test_vector_swizzle_sN(TYPE value, __global TYPE* dst) {
     int index = 0;
 
     // lvalue swizzles
-    dst[index++].s0 = value.s0;
-    dst[index++].s1 = value.s0;
-    dst[index++].s2 = value.s0;
-    dst[index++].s012 = value;
-    dst[index++].s210 = value;
+    TYPE t;
+    t = dst[index]; t.s0 = value.s0;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
+    t = dst[index]; t.s1 = value.s0;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
+    t = dst[index]; t.s2 = value.s0;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
+    t = dst[index]; t.s012 = value;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
+    t = dst[index]; t.s210 = value;
+    vstore3(t, 0, (__global BASETYPE*)(dst + index++));
 
     // rvalue swizzles
     vstore3(value.s0, 0, (__global BASETYPE*)(dst + index++));

From b67f6bbb296285cdea9ca7cf9597d8f91380e3e6 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Fri, 5 Feb 2021 10:29:45 +0000
Subject: [PATCH 035/158] Fix command line interface for math_brute_force
 (#1145)

Ensure the following forms of command lines are supported, as per usage
message (-h):
 - math_brute_force [<name1> [<name2> ... [<nameN>]]]
 - math_brute_force I [J]

Remove dead/unnecessary code.

Fix regression introduced in f337e0b6 (Fix command-line function range
for bruteforce (#1127), 2021-01-29).

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/main.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 254b22aac6..1b63c8288a 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -140,7 +140,8 @@ int doTest(const char *name)
         const Func *const temp_func = functionList + i;
         if (strcmp(temp_func->name, name) == 0)
         {
-            if (i < gStartTestNumber || i > gEndTestNumber)
+            if ((gStartTestNumber != -1 && i < gStartTestNumber)
+                || i > gEndTestNumber)
             {
                 vlog("Skipping function #%d\n", i);
                 return 0;
@@ -844,10 +845,6 @@ int main(int argc, const char *argv[])
          "----------------------------------------\n");
 
     gMTdata = init_genrand(gRandomSeed);
-    if (gEndTestNumber == 0)
-    {
-        gEndTestNumber = functionListCount;
-    }
 
     FPU_mode_type oldMode;
     DisableFTZ(&oldMode);

From 3719a0183c6187f54c42723890b30b7756af8ed1 Mon Sep 17 00:00:00 2001
From: Pierre Moreau <pierremoreau@users.noreply.github.com>
Date: Fri, 5 Feb 2021 12:34:10 +0100
Subject: [PATCH 036/158] api: Fix testing of local memory size requirement
 (#1112)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current test would always end up assuming a minimum of 16 KB
regardless of the exposed OpenCL version, and the logic for testing the
OpenCL version was hard to follow.

Besides fixing the test for OpenCL 1.1 through 2.1, it also
* adds support for OpenCL 2.2, 3.0, and future OpenCL versions (as long
  as `get_device_cl_version()` supports those);
* adapts the error message to mention the currently exposed OpenCL
  version rather than a hardcoded OpenCL 1.1;
* reports the advertised local memory size as KB and not Kb, since local
  memory size is given in bytes.
---
 test_conformance/api/test_api_min_max.cpp | 41 ++++++++++++-----------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/test_conformance/api/test_api_min_max.cpp b/test_conformance/api/test_api_min_max.cpp
index 9ac4aae334..9e981cd3f3 100644
--- a/test_conformance/api/test_api_min_max.cpp
+++ b/test_conformance/api/test_api_min_max.cpp
@@ -1670,8 +1670,7 @@ int test_min_max_local_mem_size(cl_device_id deviceID, cl_context context, cl_co
     size_t    threads[1], localThreads[1];
     cl_int *localData, *resultData;
     cl_ulong maxSize, kernelLocalUsage, min_max_local_mem_size;
-    cl_char buffer[ 4098 ];
-    size_t length;
+    Version device_version;
     int i;
     int err = 0;
     MTdata d;
@@ -1680,31 +1679,33 @@ int test_min_max_local_mem_size(cl_device_id deviceID, cl_context context, cl_co
     error = clGetDeviceInfo( deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( maxSize ), &maxSize, 0 );
     test_error( error, "Unable to get max local buffer size" );
 
-    // Device version should fit the regex "OpenCL [0-9]+\.[0-9]+ *.*"
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_VERSION, sizeof( buffer ), buffer, &length );
-    test_error( error, "Unable to get device version string" );
+    try
+    {
+        device_version = get_device_cl_version(deviceID);
+    } catch (const std::runtime_error &e)
+    {
+        log_error("%s", e.what());
+        return -1;
+    }
+
     if (!gIsEmbedded)
     {
-        if( memcmp( buffer, "OpenCL 2.0", strlen( "OpenCL 2.0" ) ) == 0 )
-            min_max_local_mem_size = 16L * 1024L;
-        else if( memcmp( buffer, "OpenCL 2.1", strlen( "OpenCL 2.1" ) ) != 0 )
-            min_max_local_mem_size = 16L * 1024L;
-        else if( memcmp( buffer, "OpenCL 1.2", strlen( "OpenCL 1.2" ) ) != 0 )
+        if (device_version == Version(1, 0))
             min_max_local_mem_size = 16L * 1024L;
-        else if( memcmp( buffer, "OpenCL 1.1", strlen( "OpenCL 1.1" ) ) != 0 )
-            min_max_local_mem_size = 16L * 1024L;
-        else if ( memcmp( buffer, "OpenCL 1.0", strlen( "OpenCL 1.0" ) ) != 0 )
-            min_max_local_mem_size = 32L * 1024L;
         else
-        {
-            log_error( "ERROR: device version string does not match required format! (returned: %s)\n", (char *)buffer );
-            return -1;
-        }
+            min_max_local_mem_size = 32L * 1024L;
+    }
+    else
+    {
+        min_max_local_mem_size = 1L * 1024L;
     }
 
-    if( maxSize < (gIsEmbedded ? 1L * 1024L : min_max_local_mem_size) )
+    if (maxSize < min_max_local_mem_size)
     {
-        log_error( "ERROR: Reported local mem size less than required by OpenCL 1.1 (reported %dKb)\n", (int)( maxSize / 1024L ) );
+        const std::string version_as_string = device_version.to_string();
+        log_error("ERROR: Reported local mem size less than required by OpenCL "
+                  "%s (reported %d KB)\n",
+                  version_as_string.c_str(), (int)(maxSize / 1024L));
         return -1;
     }
 

From f6b501352dcbe880a1ebf485c12f0b1f7cbf9494 Mon Sep 17 00:00:00 2001
From: Chetan Mistry <70694498+chemis01@users.noreply.github.com>
Date: Fri, 5 Feb 2021 16:36:16 +0000
Subject: [PATCH 037/158] Implement Negative Test for Platform Layer Functions
 (#1076)

* Implement Negative Tests for clPlatform Functions

This change introduces negative tests for clPlatform
functions as well as changes to the Harness to help with
other negative tests.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>

* [SQUASH] Remove magic macro from Negative Platform Tests

This change removes the negative-testing macro and all
other changes related to its usage.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>
---
 test_common/harness/errorHelpers.cpp       |  1 +
 test_common/harness/testHarness.cpp        |  9 +++
 test_common/harness/testHarness.h          |  2 +
 test_conformance/api/CMakeLists.txt        |  1 +
 test_conformance/api/main.cpp              |  2 +
 test_conformance/api/negative_platform.cpp | 82 ++++++++++++++++++++++
 test_conformance/api/procs.h               |  8 +++
 7 files changed, 105 insertions(+)
 create mode 100644 test_conformance/api/negative_platform.cpp

diff --git a/test_common/harness/errorHelpers.cpp b/test_common/harness/errorHelpers.cpp
index 8f3c188309..da1660f1e3 100644
--- a/test_common/harness/errorHelpers.cpp
+++ b/test_common/harness/errorHelpers.cpp
@@ -68,6 +68,7 @@ const char *IGetErrorString(int clErrorCode)
         case CL_INVALID_SAMPLER: return "CL_INVALID_SAMPLER";
         case CL_INVALID_BINARY: return "CL_INVALID_BINARY";
         case CL_INVALID_BUILD_OPTIONS: return "CL_INVALID_BUILD_OPTIONS";
+        case CL_INVALID_PLATFORM: return "CL_INVALID_PLATFORM";
         case CL_INVALID_PROGRAM: return "CL_INVALID_PROGRAM";
         case CL_INVALID_PROGRAM_EXECUTABLE:
             return "CL_INVALID_PROGRAM_EXECUTABLE";
diff --git a/test_common/harness/testHarness.cpp b/test_common/harness/testHarness.cpp
index 48dd482932..6b4c720197 100644
--- a/test_common/harness/testHarness.cpp
+++ b/test_common/harness/testHarness.cpp
@@ -1161,6 +1161,15 @@ test_status check_spirv_compilation_readiness(cl_device_id device)
     return TEST_PASS;
 }
 
+cl_platform_id getPlatformFromDevice(cl_device_id deviceID)
+{
+    cl_platform_id platform = nullptr;
+    cl_int err = clGetDeviceInfo(deviceID, CL_DEVICE_PLATFORM, sizeof(platform),
+                                 &platform, nullptr);
+    ASSERT_SUCCESS(err, "clGetDeviceInfo");
+    return platform;
+}
+
 void PrintArch(void)
 {
     vlog("sizeof( void*) = %ld\n", sizeof(void *));
diff --git a/test_common/harness/testHarness.h b/test_common/harness/testHarness.h
index 331555b2b2..d6054de981 100644
--- a/test_common/harness/testHarness.h
+++ b/test_common/harness/testHarness.h
@@ -178,6 +178,8 @@ extern int gHasLong; // This is set to 1 if the device suppots long and ulong
                      // types in OpenCL C.
 extern bool gCoreILProgram;
 
+extern cl_platform_id getPlatformFromDevice(cl_device_id deviceID);
+
 #if !defined(__APPLE__)
 void memset_pattern4(void *, const void *, size_t);
 #endif
diff --git a/test_conformance/api/CMakeLists.txt b/test_conformance/api/CMakeLists.txt
index 66efcc7b57..20cb9b82dc 100644
--- a/test_conformance/api/CMakeLists.txt
+++ b/test_conformance/api/CMakeLists.txt
@@ -2,6 +2,7 @@ set(MODULE_NAME API)
 
 set(${MODULE_NAME}_SOURCES
          main.cpp
+         negative_platform.cpp
          test_api_consistency.cpp
          test_bool.cpp
          test_retain.cpp
diff --git a/test_conformance/api/main.cpp b/test_conformance/api/main.cpp
index 10e2b57eba..16ca81c4c5 100644
--- a/test_conformance/api/main.cpp
+++ b/test_conformance/api/main.cpp
@@ -146,6 +146,8 @@ test_definition test_list[] = {
     ADD_TEST_VERSION(consistency_3d_image_writes, Version(3, 0)),
 
     ADD_TEST(min_image_formats),
+    ADD_TEST(negative_get_platform_info),
+    ADD_TEST(negative_get_platform_ids),
 };
 
 const int test_num = ARRAY_SIZE(test_list);
diff --git a/test_conformance/api/negative_platform.cpp b/test_conformance/api/negative_platform.cpp
new file mode 100644
index 0000000000..d41b35fee2
--- /dev/null
+++ b/test_conformance/api/negative_platform.cpp
@@ -0,0 +1,82 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "testBase.h"
+
+int test_negative_get_platform_ids(cl_device_id deviceID, cl_context context,
+                                   cl_command_queue queue, int num_elements)
+{
+    cl_platform_id platform;
+    cl_int err = clGetPlatformIDs(0, &platform, nullptr);
+    test_failure_error_ret(
+        err, CL_INVALID_VALUE,
+        "clGetPlatformIDs should return CL_INVALID_VALUE when: \"num_entries "
+        "is equal to zero and platforms is not NULL\"",
+        TEST_FAIL);
+
+    err = clGetPlatformIDs(1, nullptr, nullptr);
+    test_failure_error_ret(
+        err, CL_INVALID_VALUE,
+        "clGetPlatformIDs should return CL_INVALID_VALUE when: \"both "
+        "num_platforms and platforms are NULL\"",
+        TEST_FAIL);
+
+    return TEST_PASS;
+}
+
+int test_negative_get_platform_info(cl_device_id deviceID, cl_context context,
+                                    cl_command_queue queue, int num_elements)
+{
+    cl_platform_id platform = getPlatformFromDevice(deviceID);
+
+    cl_int err = clGetPlatformInfo(nullptr, CL_PLATFORM_VERSION, sizeof(char*),
+                                   nullptr, nullptr);
+    test_failure_error_ret(
+        err, CL_INVALID_PLATFORM,
+        "clGetPlatformInfo should return CL_INVALID_PLATFORM  when: \"platform "
+        "is not a valid platform\" using a nullptr",
+        TEST_FAIL);
+
+    err =
+        clGetPlatformInfo(reinterpret_cast<cl_platform_id>(deviceID),
+                          CL_PLATFORM_VERSION, sizeof(char*), nullptr, nullptr);
+    test_failure_error_ret(
+        err, CL_INVALID_PLATFORM,
+        "clGetPlatformInfo should return CL_INVALID_PLATFORM  when: \"platform "
+        "is not a valid platform\" using a valid object which is NOT a "
+        "platform",
+        TEST_FAIL);
+
+    constexpr cl_platform_info INVALID_PARAM_VALUE = 0;
+    err = clGetPlatformInfo(platform, INVALID_PARAM_VALUE, 0, nullptr, nullptr);
+    test_failure_error_ret(
+        err, CL_INVALID_VALUE,
+        "clGetPlatformInfo should return CL_INVALID_VALUE when: \"param_name "
+        "is not one of the supported values\"",
+        TEST_FAIL);
+
+    char* version;
+    err =
+        clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, &version, nullptr);
+    test_failure_error_ret(
+        err, CL_INVALID_VALUE,
+        "clGetPlatformInfo should return CL_INVALID_VALUE when: \"size in "
+        "bytes specified by param_value_size is < size of return type and "
+        "param_value is not a NULL value\"",
+        TEST_FAIL);
+
+    return TEST_PASS;
+}
diff --git a/test_conformance/api/procs.h b/test_conformance/api/procs.h
index 0dcc9a6901..af373e43c3 100644
--- a/test_conformance/api/procs.h
+++ b/test_conformance/api/procs.h
@@ -195,3 +195,11 @@ extern int test_consistency_3d_image_writes(cl_device_id deviceID,
 
 extern int test_min_image_formats(cl_device_id deviceID, cl_context context,
                                   cl_command_queue queue, int num_elements);
+extern int test_negative_get_platform_info(cl_device_id deviceID,
+                                           cl_context context,
+                                           cl_command_queue queue,
+                                           int num_elements);
+extern int test_negative_get_platform_ids(cl_device_id deviceID,
+                                          cl_context context,
+                                          cl_command_queue queue,
+                                          int num_elements);

From 8ad1088af97687d249111046e7e96e49e3458b30 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Wed, 10 Feb 2021 10:38:31 +0000
Subject: [PATCH 038/158] Reduce difference between files in math_brute_force
 (#1138)

* Reduce differences between files

This will help reduce code duplication in future commits.

Some code is moved around, some variables are renamed and some
statements are slightly altered to reduce differences between files in
math_brute_force, yet the semantics remains the same.

The differences were identified using n-way diffs. Many differences
remain however.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>

* Workaround clang-format limitation

Introduces some insignificant spaces to force clang-format to reduce the
indentation and reduce differences between files.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/binary.cpp  | 109 +++++++-------
 .../math_brute_force/binaryOperator.cpp       | 140 +++++++++---------
 .../math_brute_force/binary_i.cpp             |  50 +++----
 .../math_brute_force/binary_two_results_i.cpp |  35 +++--
 test_conformance/math_brute_force/i_unary.cpp |  55 ++++---
 .../math_brute_force/macro_binary.cpp         |  69 +++++----
 .../math_brute_force/macro_unary.cpp          |  34 +++--
 test_conformance/math_brute_force/mad.cpp     |  60 +++++---
 test_conformance/math_brute_force/ternary.cpp |  77 +++++-----
 test_conformance/math_brute_force/unary.cpp   |  52 +++----
 .../math_brute_force/unary_two_results.cpp    |  39 ++---
 .../math_brute_force/unary_two_results_i.cpp  |  47 +++---
 test_conformance/math_brute_force/unary_u.cpp | 137 ++++++++---------
 13 files changed, 482 insertions(+), 422 deletions(-)

diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary.cpp
index 1784c725d9..e6b9cbbc35 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary.cpp
@@ -25,9 +25,6 @@ int TestFunc_Float_Float_Float_nextafter(const Func *f, MTdata,
 int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata,
                                             bool relaxedMode);
 
-const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
-const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
-
 extern const vtbl _binary = { "binary", TestFunc_Float_Float_Float,
                               TestFunc_Double_Double_Double };
 
@@ -36,6 +33,8 @@ extern const vtbl _binary_nextafter = {
     TestFunc_Double_Double_Double_nextafter
 };
 
+const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
+const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
 
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
@@ -76,7 +75,8 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       float3 f0, f1;\n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -163,7 +163,8 @@ static int BuildKernelDouble(const char *name, int vectorSize,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       double3 d0, d1;\n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -208,6 +209,35 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                        relaxedMode);
 }
 
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count;
+    cl_kernel **kernels;
+    cl_program *programs;
+    const char *nameInCode;
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
+{
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
+    cl_uint i = info->offset + job_id;
+    return BuildKernel(info->nameInCode, i, info->kernel_count,
+                       info->kernels[i], info->programs + i, info->relaxedMode);
+}
+
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
+    cl_uint i = info->offset + job_id;
+    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
+                             info->kernels[i], info->programs + i,
+                             info->relaxedMode);
+}
+
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -311,38 +341,9 @@ static const float specialValuesFloat[] = {
     +0.0f
 };
 
-static size_t specialValuesFloatCount =
+static const size_t specialValuesFloatCount =
     sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
-typedef struct BuildKernelInfo
-{
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
-}
-
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
-                             info->kernels[i], info->programs + i,
-                             info->relaxedMode);
-}
-
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
@@ -426,11 +427,11 @@ static int TestFunc_Float_Float_Float_common(const Func *f, MTdata d,
     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
     test_info.ftz =
         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-
+    test_info.relaxedMode = relaxedMode;
     test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
     test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
     test_info.isNextafter = isNextafter;
-    test_info.relaxedMode = relaxedMode;
+
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
@@ -543,12 +544,11 @@ static int TestFunc_Float_Float_Float_common(const Func *f, MTdata d,
             vlog("passed");
     }
 
-
     if (gMeasureTimes)
     {
         // Init input arrays
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
         for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = (genrand_int32(d) & ~0x40000000) | 0x20000000;
@@ -561,6 +561,7 @@ static int TestFunc_Float_Float_Float_common(const Func *f, MTdata d,
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
@@ -568,7 +569,6 @@ static int TestFunc_Float_Float_Float_common(const Func *f, MTdata d,
             return error;
         }
 
-
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
@@ -633,7 +633,6 @@ static int TestFunc_Float_Float_Float_common(const Func *f, MTdata d,
         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
-
 exit:
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
@@ -684,23 +683,21 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     int skipNanInf = job->skipNanInf;
     int isNextafter = job->isNextafter;
     cl_uint *t = 0;
-    float *r = 0, *s = 0, *s2 = 0;
+    cl_float *r = 0;
+    cl_float *s = 0;
+    cl_float *s2 = 0;
     cl_int copysign_test = 0;
     RoundingMode oldRoundMode;
     int skipVerification = 0;
 
     if (relaxedMode)
     {
+        func = job->f->rfunc;
         if (strcmp(name, "pow") == 0 && gFastRelaxedDerived)
         {
-            func = job->f->rfunc;
             ulps = INFINITY;
             skipVerification = 1;
         }
-        else
-        {
-            func = job->f->rfunc;
-        }
     }
 
     // start the map of the output arrays
@@ -744,7 +741,8 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         {
             fp[j] = specialValuesFloat[x];
             fp2[j] = specialValuesFloat[y];
-            if (++x >= specialValuesFloatCount)
+            ++x;
+            if (x >= specialValuesFloatCount)
             {
                 x = 0;
                 y++;
@@ -1203,13 +1201,11 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         fflush(stdout);
     }
 
-
 exit:
     if (overflow) free(overflow);
     return error;
 }
 
-
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
     -NAN,
@@ -1444,10 +1440,10 @@ static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
+
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
     }
 
-
     // Init the kernels
     {
         BuildKernelInfo build_info = {
@@ -1460,6 +1456,7 @@ static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
         error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
@@ -1500,6 +1497,7 @@ static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
@@ -1507,7 +1505,6 @@ static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
             return error;
         }
 
-
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
@@ -1573,7 +1570,6 @@ static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
-
 exit:
     // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
@@ -1622,7 +1618,9 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 
     int isNextafter = job->isNextafter;
     cl_ulong *t;
-    cl_double *r, *s, *s2;
+    cl_double *r;
+    cl_double *s;
+    cl_double *s2;
 
     Force64BitFPUPrecision();
 
@@ -1970,6 +1968,7 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
         }
         fflush(stdout);
     }
+
 exit:
     return error;
 }
diff --git a/test_conformance/math_brute_force/binaryOperator.cpp b/test_conformance/math_brute_force/binaryOperator.cpp
index bd1a3143c0..0957c6af62 100644
--- a/test_conformance/math_brute_force/binaryOperator.cpp
+++ b/test_conformance/math_brute_force/binaryOperator.cpp
@@ -44,10 +44,11 @@ static int BuildKernel(const char *name, const char *operator_symbol,
                         "* in2 )\n"
                         "{\n"
                         "   size_t i = get_global_id(0);\n"
-                        "   out[i] =  in1[i] ",
+                        "   out[i] = in1[i] ",
                         operator_symbol,
                         " in2[i];\n"
                         "}\n" };
+
     const char *c3[] = {
         "__kernel void ",
         name,
@@ -70,7 +71,8 @@ static int BuildKernel(const char *name, const char *operator_symbol,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       float3 f0, f1;\n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -137,8 +139,9 @@ static int BuildKernelDouble(const char *name, const char *operator_symbol,
                         operator_symbol,
                         " in2[i];\n"
                         "}\n" };
+
     const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
         "__kernel void ",
         name,
         "_kernel",
@@ -160,7 +163,8 @@ static int BuildKernelDouble(const char *name, const char *operator_symbol,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       double3 d0, d1;\n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -235,43 +239,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->programs + i, info->relaxedMode);
 }
 
-// Thread specific data for a worker thread
-typedef struct ThreadInfo
-{
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
-    float maxError; // max error value. Init to 0.
-    double
-        maxErrorValue; // position of the max error value (param 1).  Init to 0.
-    double maxErrorValue2; // position of the max error value (param 2).  Init
-                           // to 0.
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
-
-typedef struct TestInfo
-{
-    size_t subBufferSize; // Size of the sub-buffer in elements
-    const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
-    cl_uint threadCount; // Number of worker threads
-    cl_uint jobCount; // Number of jobs
-    cl_uint step; // step between each chunk and the next.
-    cl_uint scale; // stride between individual test values
-    float ulps; // max_allowed ulps
-    int ftz; // non-zero if running in flush to zero mode
-    bool relaxedMode; // True if the test is being run in relaxed mode, false
-                      // otherwise.
-
-    // no special fields
-} TestInfo;
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -375,9 +342,46 @@ static const float specialValuesFloat[] = {
     +0.0f
 };
 
-static size_t specialValuesFloatCount =
+static const size_t specialValuesFloatCount =
     sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
+// Thread specific data for a worker thread
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // input buffer for the thread
+    cl_mem inBuf2; // input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    double maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdata d;
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
+    bool relaxedMode; // True if the test is being run in relaxed mode, false
+                      // otherwise.
+
+    // no special fields
+} TestInfo;
+
 static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
@@ -398,6 +402,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -405,7 +410,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
                * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
         // there was overflow
@@ -481,8 +486,8 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gInBuffer for region {%zd, %zd}\n",
                            region.origin, region.size);
                 goto exit;
             }
@@ -513,6 +518,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
         error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
@@ -536,7 +542,6 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
             vlog("passed");
     }
 
-
     if (gMeasureTimes)
     {
         // Init input arrays
@@ -554,6 +559,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
@@ -626,7 +632,6 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
-
 exit:
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
@@ -665,30 +670,31 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint base = job_id * (cl_uint)job->step;
     ThreadInfo *tinfo = job->tinfo + thread_id;
     fptr func = job->f->func;
+    int ftz = job->ftz;
     bool relaxedMode = job->relaxedMode;
     float ulps = getAllowedUlpError(job->f, relaxedMode);
-    if (relaxedMode)
-    {
-        func = job->f->rfunc;
-    }
-
-
-    int ftz = job->ftz;
     MTdata d = tinfo->d;
     cl_uint j, k;
     cl_int error;
     cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
     const char *name = job->f->name;
-    cl_uint *t;
-    cl_float *r, *s, *s2;
+    cl_uint *t = 0;
+    cl_float *r = 0;
+    cl_float *s = 0;
+    cl_float *s2 = 0;
     RoundingMode oldRoundMode;
 
+    if (relaxedMode)
+    {
+        func = job->f->rfunc;
+    }
+
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_uint *out[VECTOR_SIZE_COUNT];
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t *)clEnqueueMapBuffer(
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
             buffer_size, 0, NULL, e + j, &error);
         if (error || NULL == out[j])
@@ -711,7 +717,6 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         specialValuesFloatCount * specialValuesFloatCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
-
     if (job_id <= (cl_uint)indx)
     {
         // Insert special values
@@ -877,7 +882,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     // an in order queue.
     for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t *)clEnqueueMapBuffer(
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
@@ -889,9 +894,9 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
     // Wait for the last buffer
-    out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
+    out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                           0, NULL, NULL, &error);
     if (error || NULL == out[j])
     {
         vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
@@ -1136,6 +1141,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         }
         fflush(stdout);
     }
+
 exit:
     if (overflow) free(overflow);
     return error;
@@ -1267,6 +1273,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
     double maxErrorVal2 = 0.0;
+
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
@@ -1275,6 +1282,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -1373,7 +1381,6 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
     }
 
-
     // Init the kernels
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex,
@@ -1389,6 +1396,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
         error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
@@ -1412,7 +1420,6 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
             vlog("passed");
     }
 
-
     if (gMeasureTimes)
     {
         // Init input arrays
@@ -1503,7 +1510,6 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
-
 exit:
     // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
@@ -1551,7 +1557,9 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     cl_int error;
     const char *name = job->f->name;
     cl_ulong *t;
-    cl_double *r, *s, *s2;
+    cl_double *r;
+    cl_double *s;
+    cl_double *s2;
 
     Force64BitFPUPrecision();
 
diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i.cpp
index a8535281b5..f931c5be49 100644
--- a/test_conformance/math_brute_force/binary_i.cpp
+++ b/test_conformance/math_brute_force/binary_i.cpp
@@ -15,8 +15,8 @@
 //
 #include "Utility.h"
 
-#include <string.h>
 #include <limits.h>
+#include <string.h>
 #include "FunctionList.h"
 
 int TestFunc_Float_Float_Int(const Func *f, MTdata, bool relaxedMode);
@@ -228,7 +228,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->relaxedMode);
 }
 
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -331,9 +330,9 @@ static const float specialValuesFloat[] = {
     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
     +0.0f
 };
-static size_t specialValuesFloatCount =
-    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
+static const size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
 static const int specialValuesInt[] = {
     0,           1,           2,          3,          126,        127,
@@ -484,8 +483,8 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gInBuffer for region {%zd, %zd}\n",
                            region.origin, region.size);
                 goto exit;
             }
@@ -497,6 +496,7 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
+
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
     }
 
@@ -537,7 +537,6 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
             vlog("passed");
     }
 
-
     if (gMeasureTimes)
     {
         // Init input arrays
@@ -555,6 +554,7 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
@@ -627,7 +627,6 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
         vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
-
 exit:
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
@@ -658,7 +657,6 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
     return error;
 }
 
-
 static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
@@ -666,23 +664,24 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint base = job_id * (cl_uint)job->step;
     ThreadInfo *tinfo = job->tinfo + thread_id;
-    float ulps = job->ulps;
     fptr func = job->f->func;
     int ftz = job->ftz;
+    float ulps = job->ulps;
     MTdata d = tinfo->d;
     cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
-    cl_uint *t;
-    cl_float *r, *s;
-    cl_int *s2;
+    cl_uint *t = 0;
+    cl_float *r = 0;
+    cl_float *s = 0;
+    cl_int *s2 = 0;
 
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_uint *out[VECTOR_SIZE_COUNT];
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t *)clEnqueueMapBuffer(
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
             buffer_size, 0, NULL, e + j, &error);
         if (error || NULL == out[j])
@@ -700,9 +699,11 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
+
     int totalSpecialValueCount =
         specialValuesFloatCount * specialValuesIntCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
     if (job_id <= (cl_uint)indx)
     { // test edge cases
         float *fp = (float *)p;
@@ -716,7 +717,8 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         {
             fp[j] = specialValuesFloat[x];
             ip2[j] = specialValuesInt[y];
-            if (++x >= specialValuesFloatCount)
+            ++x;
+            if (x >= specialValuesFloatCount)
             {
                 x = 0;
                 y++;
@@ -820,7 +822,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     // an in order queue.
     for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t *)clEnqueueMapBuffer(
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
@@ -832,9 +834,9 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
     // Wait for the last buffer
-    out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
+    out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                           0, NULL, NULL, &error);
     if (error || NULL == out[j])
     {
         vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
@@ -1057,6 +1059,7 @@ static const double specialValuesDouble[] = {
     MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
     +0.0,
 };
+
 static size_t specialValuesDoubleCount =
     sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
 
@@ -1165,12 +1168,9 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
 
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with
-             * parent buffer */
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
                 &region, &error);
-            /* Qualcomm fix: end */
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
                 vlog_error("Error: Unable to create sub-buffer of gInBuffer "
@@ -1190,7 +1190,6 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
     }
 
-
     // Init the kernels
     {
         BuildKernelInfo build_info = {
@@ -1320,7 +1319,6 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
         vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
-
 exit:
     // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
@@ -1367,7 +1365,8 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     cl_int error;
     const char *name = job->f->name;
     cl_ulong *t;
-    cl_double *r, *s;
+    cl_double *r;
+    cl_double *s;
     cl_int *s2;
 
     Force64BitFPUPrecision();
@@ -1398,6 +1397,7 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     int totalSpecialValueCount =
         specialValuesDoubleCount * specialValuesInt2Count;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
     if (job_id <= (cl_uint)indx)
     { // test edge cases
         cl_double *fp = (cl_double *)p;
diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
index a0aa9d2503..2ecf1c287f 100644
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i.cpp
@@ -40,7 +40,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in1, __global float",
                         sizeNames[vectorSize],
-                        "* in2)\n"
+                        "* in2 )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
@@ -71,7 +71,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       float3 f0, f1;\n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
+        "       int3 i0 = 0xdeaddead;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -83,7 +85,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
         "               break;\n"
         "       }\n"
-        "       int3 i0 = 0xdeaddead;\n"
         "       f0 = ",
         name,
         "( f0, f1, &i0 );\n"
@@ -132,12 +133,12 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in1, __global double",
                         sizeNames[vectorSize],
-                        "* in2)\n"
+                        "* in2 )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in1[i], in2[i], out2 + i );\n"
                         "}\n" };
 
     const char *c3[] = {
@@ -164,7 +165,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       double3 d0, d1;\n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
+        "       int3 i0 = 0xdeaddead;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -176,7 +179,6 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
         "               break;\n"
         "       }\n"
-        "       int3 i0 = 0xdeaddead;\n"
         "       d0 = ",
         name,
         "( d0, d1, &i0 );\n"
@@ -309,20 +311,22 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
-    float float_ulps;
-    int64_t maxError2 = 0;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    int64_t maxError2 = 0;
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
 
     cl_uint threadCount = GetThreadCount();
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
+    float float_ulps;
     if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
@@ -485,7 +489,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         {
             for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
-                uint32_t *q = (uint32_t *)gOut[k];
+                uint32_t *q = (uint32_t *)(gOut[k]);
                 int32_t *q2 = (int32_t *)gOut2[k];
 
                 // Check for exact match to correctly rounded result
@@ -695,9 +699,11 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     if (gMeasureTimes)
     {
         // Init input array
-        uint32_t *p = (uint32_t *)gIn;
+        cl_uint *p = (cl_uint *)gIn;
         for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
             p[j] = genrand_int32(d);
+        }
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
@@ -823,9 +829,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
-        {
             return error;
-        }
     }
 
     for (i = 0; i < (1ULL << 32); i += step)
@@ -1185,7 +1189,6 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             {
                 vlog(".");
             }
-
             fflush(stdout);
         }
     }
@@ -1202,7 +1205,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         // Init input array
         double *p = (double *)gIn;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
             p[j] = DoubleFromUInt32(genrand_int32(d));
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary.cpp
index 7f2f79a3f8..b736b24f1f 100644
--- a/test_conformance/math_brute_force/i_unary.cpp
+++ b/test_conformance/math_brute_force/i_unary.cpp
@@ -42,6 +42,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         name,
                         "( in[i] );\n"
                         "}\n" };
+
     const char *c3[] = {
         "__kernel void math_kernel",
         sizeNames[vectorSize],
@@ -87,7 +88,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "}\n"
     };
 
-
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
 
@@ -114,7 +114,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* out, __global double",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
@@ -177,7 +177,6 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
-
     char testName[32];
     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
              sizeNames[vectorSize]);
@@ -219,7 +218,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     int error;
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
-    int ftz = f->ftz || 0 == (gFloatCapabilities & CL_FP_DENORM) || gForceFTZ;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
     int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
@@ -234,27 +233,30 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     Force64BitFPUPrecision();
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
-        return error;
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
-        uint32_t *p = (uint32_t *)gIn;
+        cl_uint *p = (cl_uint *)gIn;
         if (gWimpyMode)
         {
             for (j = 0; j < bufferSize / sizeof(float); j++)
-                p[j] = (uint32_t)i + j * scale;
+                p[j] = (cl_uint)i + j * scale;
         }
         else
         {
             for (j = 0; j < bufferSize / sizeof(float); j++)
                 p[j] = (uint32_t)i + j;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
@@ -281,7 +283,8 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -396,8 +399,9 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -447,6 +451,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     }
 
     vlog("\n");
+
 exit:
     RestoreFPState(&oldMode);
     // Release
@@ -481,13 +486,13 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     Force64BitFPUPrecision();
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
     {
-        return error;
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
     }
 
     for (i = 0; i < (1ULL << 32); i += step)
@@ -504,6 +509,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             for (j = 0; j < bufferSize / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
@@ -529,8 +535,9 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -616,6 +623,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             {
                 vlog(".");
             }
+
             fflush(stdout);
         }
     }
@@ -698,7 +706,6 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 
     vlog("\n");
 
-
 exit:
     RestoreFPState(&oldMode);
     // Release
diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary.cpp
index b0b8214956..0c37068e16 100644
--- a/test_conformance/math_brute_force/macro_binary.cpp
+++ b/test_conformance/math_brute_force/macro_binary.cpp
@@ -64,7 +64,8 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       float3 f0, f1;\n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -92,7 +93,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
         "}\n"
     };
 
-
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
 
@@ -110,7 +110,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-
 static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
                              bool relaxedMode)
@@ -153,7 +152,8 @@ static int BuildKernelDouble(const char *name, int vectorSize,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       double3 f0, f1;\n"
+        "       double3 f0;\n"
+        "       double3 f1;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -190,7 +190,6 @@ static int BuildKernelDouble(const char *name, int vectorSize,
         kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
-
     char testName[32];
     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
              sizeNames[vectorSize]);
@@ -228,7 +227,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->relaxedMode);
 }
 
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -379,6 +377,7 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -460,8 +459,8 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gInBuffer for region {%zd, %zd}\n",
                            region.origin, region.size);
                 goto exit;
             }
@@ -489,7 +488,6 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
     }
 
-
     // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
@@ -506,8 +504,8 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     if (gMeasureTimes)
     {
         // Init input arrays
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
         for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
@@ -520,6 +518,7 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
@@ -531,8 +530,9 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(test_info.k[j][0], 0,
                                         sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
@@ -586,6 +586,7 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                       f->name, sizeNames[j]);
         }
     }
+
     vlog("\n");
 
 exit:
@@ -631,8 +632,10 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
-    cl_int *t, *r;
-    cl_float *s, *s2;
+    cl_int *t = 0;
+    cl_int *r = 0;
+    cl_float *s = 0;
+    cl_float *s2 = 0;
 
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
@@ -657,6 +660,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
+
     int totalSpecialValueCount =
         specialValuesFloatCount * specialValuesFloatCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
@@ -674,7 +678,8 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         {
             fp[j] = specialValuesFloat[x];
             fp2[j] = specialValuesFloat[y];
-            if (++x >= specialValuesFloatCount)
+            ++x;
+            if (x >= specialValuesFloatCount)
             {
                 x = 0;
                 y++;
@@ -690,7 +695,6 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         p2[j] = genrand_int32(d);
     }
 
-
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
                                       buffer_size, p, 0, NULL, NULL)))
     {
@@ -895,6 +899,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
             }
         }
     }
+
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
@@ -1044,7 +1049,6 @@ static const double specialValuesDouble[] = {
 static size_t specialValuesDoubleCount =
     sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
 
-
 static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
@@ -1061,6 +1065,7 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -1136,12 +1141,9 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with
-             * parent buffer */
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
                 &region, &error);
-            /* Qualcomm fix: end */
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
                 vlog_error("Error: Unable to create sub-buffer of gInBuffer "
@@ -1161,7 +1163,6 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
     }
 
-
     // Init the kernels
     {
         BuildKernelInfo build_info = {
@@ -1174,6 +1175,7 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
         error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
@@ -1189,8 +1191,8 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     if (gMeasureTimes)
     {
         // Init input arrays
-        uint64_t *p = (uint64_t *)gIn;
-        uint64_t *p2 = (uint64_t *)gIn2;
+        cl_ulong *p = (cl_ulong *)gIn;
+        cl_ulong *p2 = (cl_ulong *)gIn2;
         for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
             p[j] =
@@ -1216,8 +1218,9 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(test_info.k[j][0], 0,
                                         sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
@@ -1319,8 +1322,10 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
-    cl_long *t, *r;
-    cl_double *s, *s2;
+    cl_long *t;
+    cl_long *r;
+    cl_double *s;
+    cl_double *s2;
 
     Force64BitFPUPrecision();
 
@@ -1378,7 +1383,6 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
         ((cl_ulong *)p2)[j] = genrand_int64(d);
     }
 
-
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
                                       buffer_size, p, 0, NULL, NULL)))
     {
@@ -1493,11 +1497,12 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     t = (cl_long *)r;
     for (j = 0; j < buffer_elements; j++)
     {
-        cl_long *q = (cl_long *)out[0];
+        cl_long *q = out[0];
 
         // If we aren't getting the correctly rounded result
         if (gMinVectorSizeIndex == 0 && t[j] != q[j])
         {
+            // If we aren't getting the correctly rounded result
             if (ftz)
             {
                 if (IsDoubleSubnormal(s[j]))
@@ -1528,7 +1533,7 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
                 }
             }
 
-            uint64_t err = t[j] - q[j];
+            cl_ulong err = t[j] - q[j];
             if (q[j] > t[j]) err = q[j] - t[j];
             vlog_error("\nERROR: %s: %lld ulp error at {%.13la, %.13la}: *%lld "
                        "vs. %lld  (index: %d)\n",
@@ -1575,7 +1580,7 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
                     }
                 }
 
-                uint64_t err = -t[j] - q[j];
+                cl_ulong err = -t[j] - q[j];
                 if (q[j] > -t[j]) err = q[j] + t[j];
                 vlog_error("\nERROR: %sD%s: %lld ulp error at {%.13la, "
                            "%.13la}: *%lld vs. %lld  (index: %d)\n",
diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary.cpp
index bf08a17081..ced72be816 100644
--- a/test_conformance/math_brute_force/macro_unary.cpp
+++ b/test_conformance/math_brute_force/macro_unary.cpp
@@ -34,13 +34,14 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                         sizeNames[vectorSize],
                         "* out, __global float",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i] );\n"
                         "}\n" };
+
     const char *c3[] = {
         "__kernel void math_kernel",
         sizeNames[vectorSize],
@@ -115,7 +116,7 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                         sizeNames[vectorSize],
                         "* out, __global double",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
@@ -178,7 +179,6 @@ static int BuildKernelDouble(const char *name, int vectorSize,
         kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
-
     char testName[32];
     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
              sizeNames[vectorSize]);
@@ -258,6 +258,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -279,6 +280,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     test_info.f = f;
     test_info.ftz =
         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
@@ -328,8 +330,8 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
-                           "for region {%zd, %zd}\n",
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer for region {%zd, %zd}\n",
                            region.origin, region.size);
                 goto exit;
             }
@@ -355,6 +357,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
         error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
@@ -501,7 +504,6 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-
     // Get that moving
     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
@@ -569,7 +571,6 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-
     // Get that moving
     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
@@ -594,6 +595,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
             return error;
         }
     }
+
     // Wait for the last buffer
     out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
@@ -711,12 +713,14 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     size_t i, j;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
     // Init test_info
     memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -782,12 +786,9 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with
-             * parent buffer */
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
                 &region, &error);
-            /* Qualcomm fix: end */
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
                 vlog_error("Error: Unable to create sub-buffer of gInBuffer "
@@ -817,6 +818,7 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
         error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
@@ -846,8 +848,9 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(test_info.k[j][0], 0,
                                         sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
@@ -900,6 +903,7 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     vlog("\n");
 
 exit:
+    // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
@@ -936,9 +940,9 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint base = job_id * (cl_uint)job->step;
     ThreadInfo *tinfo = job->tinfo + thread_id;
     dptr dfunc = job->f->dfunc;
+    int ftz = job->ftz;
     cl_uint j, k;
     cl_int error;
-    int ftz = job->ftz;
     const char *name = job->f->name;
 
     Force64BitFPUPrecision();
@@ -1027,7 +1031,6 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-
     // Get that moving
     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
@@ -1052,6 +1055,7 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
             return error;
         }
     }
+
     // Wait for the last buffer
     out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
                                            CL_TRUE, CL_MAP_READ, 0, buffer_size,
@@ -1062,14 +1066,12 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
         return error;
     }
 
-
     // Verify data
     cl_long *t = (cl_long *)r;
     for (j = 0; j < buffer_elements; j++)
     {
         cl_long *q = out[0];
 
-
         // If we aren't getting the correctly rounded result
         if (gMinVectorSizeIndex == 0 && t[j] != q[j])
         {
diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad.cpp
index fb144e4b8d..872caa0bee 100644
--- a/test_conformance/math_brute_force/mad.cpp
+++ b/test_conformance/math_brute_force/mad.cpp
@@ -44,6 +44,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         name,
                         "( in1[i], in2[i], in3[i] );\n"
                         "}\n" };
+
     const char *c3[] = {
         "__kernel void math_kernel",
         sizeNames[vectorSize],
@@ -66,7 +67,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       float3 f0, f1, f2;\n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
+        "       float3 f2;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -133,6 +136,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         name,
                         "( in1[i], in2[i], in3[i] );\n"
                         "}\n" };
+
     const char *c3[] = {
         "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
         "__kernel void math_kernel",
@@ -156,7 +160,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       double3 d0, d1, d2;\n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
+        "       double3 d2;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -247,37 +253,42 @@ int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode)
     uint64_t step = getTestStep(sizeof(float), bufferSize);
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
-        return error;
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
-        uint32_t *p3 = (uint32_t *)gIn3;
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        cl_uint *p3 = (cl_uint *)gIn3;
         for (j = 0; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
             p3[j] = genrand_int32(d);
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           bufferSize, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
                                           bufferSize, gIn3, 0, NULL, NULL)))
         {
@@ -379,15 +390,15 @@ int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode)
         if (gWimpyMode)
             vlog("Wimp pass");
         else
-            vlog("pass");
+            vlog("passed");
     }
 
     if (gMeasureTimes)
     {
         // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
-        uint32_t *p3 = (uint32_t *)gIn3;
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        cl_uint *p3 = (cl_uint *)gIn3;
         for (j = 0; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
@@ -508,18 +519,18 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
     double maxErrorVal2 = 0.0f;
     double maxErrorVal3 = 0.0f;
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(double), bufferSize);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    uint64_t step = getTestStep(sizeof(double), bufferSize);
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
     {
-        return error;
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
     }
 
     for (i = 0; i < (1ULL << 32); i += step)
@@ -534,18 +545,21 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
             p2[j] = DoubleFromUInt32(genrand_int32(d));
             p3[j] = DoubleFromUInt32(genrand_int32(d));
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           bufferSize, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
                                           bufferSize, gIn3, 0, NULL, NULL)))
         {
@@ -647,7 +661,7 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
         if (gWimpyMode)
             vlog("Wimp pass");
         else
-            vlog("pass");
+            vlog("passed");
     }
 
     if (gMeasureTimes)
diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary.cpp
index b3eea0f4da..1b03b209a0 100644
--- a/test_conformance/math_brute_force/ternary.cpp
+++ b/test_conformance/math_brute_force/ternary.cpp
@@ -52,7 +52,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     const char *c3[] = {
         "__kernel void math_kernel",
         sizeNames[vectorSize],
-        "( __global float* out, __global float* in, __global float* in2 , "
+        "( __global float* out, __global float* in, __global float* in2, "
         "__global float* in3)\n"
         "{\n"
         "   size_t i = get_global_id(0);\n"
@@ -71,7 +71,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       float3 f0, f1, f2;\n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
+        "       float3 f2;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -143,7 +145,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
         "__kernel void math_kernel",
         sizeNames[vectorSize],
-        "( __global double* out, __global double* in, __global double* in2 , "
+        "( __global double* out, __global double* in, __global double* in2, "
         "__global double* in3)\n"
         "{\n"
         "   size_t i = get_global_id(0);\n"
@@ -162,7 +164,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       double3 d0, d1, d2;\n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
+        "       double3 d2;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -235,7 +239,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->programs + i, info->relaxedMode);
 }
 
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -315,7 +318,7 @@ static const float specialValuesFloat[] = {
     +0.0f
 };
 
-static size_t specialValuesFloatCount =
+static const size_t specialValuesFloatCount =
     sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
 
@@ -324,6 +327,9 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
@@ -332,33 +338,34 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal2 = 0.0f;
     float maxErrorVal3 = 0.0f;
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-
     uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int skipNanInf = (0 == strcmp("fma", f->nameInCode)) && !gInfNanSupport;
-    cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
-    float float_ulps;
 
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+    cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
 
+    float float_ulps;
     if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
         float_ulps = f->float_ulps;
 
+    int skipNanInf = (0 == strcmp("fma", f->nameInCode)) && !gInfNanSupport;
+
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
-        return error;
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
-        uint32_t *p3 = (uint32_t *)gIn3;
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        cl_uint *p3 = (cl_uint *)gIn3;
         j = 0;
         if (i == 0)
         { // test edge cases
@@ -393,18 +400,21 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             p2[j] = genrand_int32(d);
             p3[j] = genrand_int32(d);
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           bufferSize, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
                                           bufferSize, gIn3, 0, NULL, NULL)))
         {
@@ -493,7 +503,6 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                     (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
         }
 
-
         // Read the data back
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
@@ -963,9 +972,9 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     if (gMeasureTimes)
     {
         // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
-        uint32_t *p3 = (uint32_t *)gIn3;
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        cl_uint *p3 = (cl_uint *)gIn3;
         for (j = 0; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
@@ -1160,21 +1169,21 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
     double maxErrorVal = 0.0f;
     double maxErrorVal2 = 0.0f;
     double maxErrorVal3 = 0.0f;
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(double), bufferSize);
 
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
     Force64BitFPUPrecision();
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
     {
-        return error;
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
     }
 
     for (i = 0; i < (1ULL << 32); i += step)
@@ -1213,18 +1222,21 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
             p2[j] = DoubleFromUInt32(genrand_int32(d));
             p3[j] = DoubleFromUInt32(genrand_int32(d));
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           bufferSize, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
                                           bufferSize, gIn3, 0, NULL, NULL)))
         {
@@ -1287,7 +1299,6 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
             }
         }
 
-
         // Get that moving
         if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary.cpp
index ae3f54e808..5085b9b4d7 100644
--- a/test_conformance/math_brute_force/unary.cpp
+++ b/test_conformance/math_brute_force/unary.cpp
@@ -37,13 +37,14 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                         sizeNames[vectorSize],
                         "* out, __global float",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i] );\n"
                         "}\n" };
+
     const char *c3[] = {
         "__kernel void math_kernel",
         sizeNames[vectorSize],
@@ -89,7 +90,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
         "}\n"
     };
 
-
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
 
@@ -118,7 +118,7 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                         sizeNames[vectorSize],
                         "* out, __global double",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
@@ -181,7 +181,6 @@ static int BuildKernelDouble(const char *name, int vectorSize,
         kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
-
     char testName[32];
     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
              sizeNames[vectorSize]);
@@ -249,7 +248,7 @@ typedef struct TestInfo
     int isRangeLimited; // 1 if the function is only to be evaluated over a
                         // range
     float half_sin_cos_tan_limit;
-    bool relaxedMode; // True if test is to be run in relaxed mode, false
+    bool relaxedMode; // True if test is running in relaxed mode, false
                       // otherwise.
 } TestInfo;
 
@@ -269,10 +268,10 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     // Init test_info
     memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -345,8 +344,8 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gInBuffer for region {%zd, %zd}\n",
                            region.origin, region.size);
                 goto exit;
             }
@@ -390,6 +389,7 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting || skipTestingRelaxed)
     {
         error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
@@ -443,8 +443,9 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(test_info.k[j][0], 0,
                                         sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
@@ -479,9 +480,9 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                 }
 
                 uint64_t endTime = GetTime();
-                double current_time = SubtractTime(endTime, startTime);
-                sum += current_time;
-                if (current_time < bestTime) bestTime = current_time;
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
             }
 
             if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
@@ -497,6 +498,7 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     vlog("\n");
 
 exit:
+    // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
@@ -553,7 +555,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint *out[VECTOR_SIZE_COUNT];
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t *)clEnqueueMapBuffer(
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
             buffer_size, 0, NULL, e + j, &error);
         if (error || NULL == out[j])
@@ -627,7 +629,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
             return error;
         }
 
-        // run the kernel
+        // Run the kernel
         size_t vectorCount =
             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
@@ -655,7 +657,6 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-
     // Get that moving
     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
@@ -670,7 +671,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     // an in order queue.
     for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t *)clEnqueueMapBuffer(
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
@@ -680,6 +681,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
             return error;
         }
     }
+
     // Wait for the last buffer
     out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
                                             CL_TRUE, CL_MAP_READ, 0,
@@ -1246,12 +1248,9 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with
-             * parent buffer */
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
                 &region, &error);
-            /* Qualcomm fix: end */
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
                 vlog_error("Error: Unable to create sub-buffer of gInBuffer "
@@ -1281,6 +1280,7 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
         error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
@@ -1334,8 +1334,9 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(test_info.k[j][0], 0,
                                         sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
@@ -1370,9 +1371,9 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
                 }
 
                 uint64_t endTime = GetTime();
-                double current_time = SubtractTime(endTime, startTime);
-                sum += current_time;
-                if (current_time < bestTime) bestTime = current_time;
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
             }
 
             if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
@@ -1393,6 +1394,7 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     vlog("\n");
 
 exit:
+    // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results.cpp
index 77d40b0d46..a3be1d8d5e 100644
--- a/test_conformance/math_brute_force/unary_two_results.cpp
+++ b/test_conformance/math_brute_force/unary_two_results.cpp
@@ -36,7 +36,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* out2, __global float",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
@@ -93,6 +93,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "   }\n"
         "}\n"
     };
+
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
 
@@ -121,7 +122,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* out2, __global double",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
@@ -179,6 +180,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "   }\n"
         "}\n"
     };
+
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
 
@@ -242,17 +244,19 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
     cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
     int isFract = 0 == strcmp("fract", f->nameInCode);
     int skipNanInf = isFract && !gInfNanSupport;
-    float float_ulps = getAllowedUlpError(f, relaxedMode);
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
+    float float_ulps = getAllowedUlpError(f, relaxedMode);
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
-        return error;
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
@@ -282,6 +286,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
                 }
             }
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
@@ -454,7 +459,6 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
                     if (relaxedMode || skipNanInf)
                     {
                         if (skipNanInf && overflow[j]) continue;
-
                         // Note: no double rounding here.  Reference functions
                         // calculate in single precision.
                         if (IsFloatInfinity(correct) || IsFloatNaN(correct)
@@ -670,6 +674,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         uint32_t *p = (uint32_t *)gIn;
         for (j = 0; j < bufferSize / sizeof(float); j++)
             p[j] = genrand_int32(d);
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
@@ -706,7 +711,6 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
             double bestTime = INFINITY;
             for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
-
                 uint64_t startTime = GetTime();
                 if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
                                                     &localCount, NULL, 0, NULL,
@@ -775,13 +779,13 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
     Force64BitFPUPrecision();
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
     {
-        return error;
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
     }
 
     for (i = 0; i < (1ULL << 32); i += step)
@@ -1103,7 +1107,6 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
             double bestTime = INFINITY;
             for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
-
                 uint64_t startTime = GetTime();
                 if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
                                                     &localCount, NULL, 0, NULL,
diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i.cpp
index f3c73434b8..6c56ed1f94 100644
--- a/test_conformance/math_brute_force/unary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i.cpp
@@ -37,13 +37,14 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* out2, __global float",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i], out2 + i );\n"
                         "}\n" };
+
     const char *c3[] = {
         "__kernel void math_kernel",
         sizeNames[vectorSize],
@@ -93,6 +94,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "   }\n"
         "}\n"
     };
+
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
 
@@ -121,13 +123,14 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* out2, __global double",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i], out2 + i );\n"
                         "}\n" };
+
     const char *c3[] = {
         "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
         "__kernel void math_kernel",
@@ -178,6 +181,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "   }\n"
         "}\n"
     };
+
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
 
@@ -240,13 +244,13 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    float float_ulps;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
     int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
     cl_ulong maxiError;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
+    float float_ulps;
     if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
@@ -255,12 +259,14 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
     maxiError = float_ulps == INFINITY ? CL_ULONG_MAX : 0;
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
-        return error;
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
@@ -600,22 +606,21 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
     double maxErrorVal2 = 0.0f;
     cl_ulong maxiError = f->double_ulps == INFINITY ? CL_ULONG_MAX : 0;
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-
-    uint64_t step = getTestStep(sizeof(double), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(double)) + 1);
+    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     Force64BitFPUPrecision();
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
     {
-        return error;
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
     }
 
     for (i = 0; i < (1ULL << 32); i += step)
@@ -624,12 +629,12 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
         double *p = (double *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < bufferSize / sizeof(double); j++)
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
         }
         else
         {
-            for (j = 0; j < bufferSize / sizeof(double); j++)
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
@@ -928,7 +933,7 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
             double clocksPerOp = bestTime * (double)gDeviceFrequency
                 * gComputeDevices * gSimdSize * 1e6
                 / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sd%s",
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
         for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u.cpp
index 53f5db384e..df6724cafe 100644
--- a/test_conformance/math_brute_force/unary_u.cpp
+++ b/test_conformance/math_brute_force/unary_u.cpp
@@ -33,13 +33,14 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* out, __global uint",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i] );\n"
                         "}\n" };
+
     const char *c3[] = {
         "__kernel void math_kernel",
         sizeNames[vectorSize],
@@ -112,7 +113,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* out, __global ulong",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
@@ -120,51 +121,53 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         "( in[i] );\n"
                         "}\n" };
 
-    const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                         "__kernel void math_kernel",
-                         sizeNames[vectorSize],
-                         "( __global double* out, __global ulong* in)\n"
-                         "{\n"
-                         "   size_t i = get_global_id(0);\n"
-                         "   if( i + 1 < get_global_size(0) )\n"
-                         "   {\n"
-                         "       ulong3 u0 = vload3( 0, in + 3 * i );\n"
-                         "       double3 f0 = ",
-                         name,
-                         "( u0 );\n"
-                         "       vstore3( f0, 0, out + 3*i );\n"
-                         "   }\n"
-                         "   else\n"
-                         "   {\n"
-                         "       size_t parity = i & 1;   // Figure out how "
-                         "many elements are left over after BUFFER_SIZE % "
-                         "(3*sizeof(float)). Assume power of two buffer size \n"
-                         "       ulong3 u0;\n"
-                         "       switch( parity )\n"
-                         "       {\n"
-                         "           case 1:\n"
-                         "               u0 = (ulong3)( in[3*i], "
-                         "0xdeaddeaddeaddeadUL, 0xdeaddeaddeaddeadUL ); \n"
-                         "               break;\n"
-                         "           case 0:\n"
-                         "               u0 = (ulong3)( in[3*i], in[3*i+1], "
-                         "0xdeaddeaddeaddeadUL ); \n"
-                         "               break;\n"
-                         "       }\n"
-                         "       double3 f0 = ",
-                         name,
-                         "( u0 );\n"
-                         "       switch( parity )\n"
-                         "       {\n"
-                         "           case 0:\n"
-                         "               out[3*i+1] = f0.y; \n"
-                         "               // fall through\n"
-                         "           case 1:\n"
-                         "               out[3*i] = f0.x; \n"
-                         "               break;\n"
-                         "       }\n"
-                         "   }\n"
-                         "}\n" };
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global ulong* in                 )\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       ulong3 u0 = vload3( 0, in + 3 * i );\n"
+        "       double3 f0 = ",
+        name,
+        "( u0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       ulong3 u0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, "
+        "0xdeaddeaddeaddeadUL ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               u0 = (ulong3)( in[3*i], in[3*i+1], "
+        "0xdeaddeaddeaddeadUL ); \n"
+        "               break;\n"
+        "       }\n"
+        "       double3 f0 = ",
+        name,
+        "( u0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
@@ -175,7 +178,6 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
-
     char testName[32];
     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
              sizeNames[vectorSize]);
@@ -221,27 +223,28 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal = 0.0f;
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-
     uint64_t step = getTestStep(sizeof(float), bufferSize);
     int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(double)) + 1);
     int isRangeLimited = 0;
-    float float_ulps;
     float half_sin_cos_tan_limit = 0;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
+    float float_ulps;
     if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
         float_ulps = f->float_ulps;
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
-        return error;
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
 
     if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
     {
@@ -317,7 +320,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
                      clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
                                             &localCount, NULL, 0, NULL, NULL)))
             {
-                vlog_error("FAILURE -- could not execute kernel\n");
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
@@ -419,7 +422,6 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
         }
     }
 
-
     if (!gSkipCorrectnessTesting)
     {
         if (gWimpyMode)
@@ -477,7 +479,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
                                                     &localCount, NULL, 0, NULL,
                                                     NULL)))
                 {
-                    vlog_error("FAILURE -- could not execute kernel\n");
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
@@ -540,13 +542,13 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
     Force64BitFPUPrecision();
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
     {
-        return error;
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
     }
 
     for (i = 0; i < (1ULL << 32); i += step)
@@ -599,7 +601,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
                      clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
                                             &localCount, NULL, 0, NULL, NULL)))
             {
-                vlog_error("FAILURE -- could not execute kernel\n");
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
@@ -627,7 +629,6 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
 
         if (gSkipCorrectnessTesting) break;
 
-
         // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
         for (j = 0; j < bufferSize / sizeof(cl_double); j++)
@@ -741,7 +742,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
                                                     &localCount, NULL, 0, NULL,
                                                     NULL)))
                 {
-                    vlog_error("FAILURE -- could not execute kernel\n");
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 

From 87a1525d5311e113ccdbdd8d7947b24c50d34293 Mon Sep 17 00:00:00 2001
From: Sven van Haastregt <sven.vanhaastregt@arm.com>
Date: Thu, 11 Feb 2021 11:37:14 +0000
Subject: [PATCH 039/158] NFC: clang-format test_basic_parameter_types.cpp
 (#1151)

* Use raw string literals in basic parameter test; NFC

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
Change-Id: I294d2aa95c6bab37e5efb7c8b1e43a06d31a0081

* clang-format test_basic_parameter_types.cpp; NFC

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
Change-Id: I0792037f5fa6f79d493c099bed15238f0f1486ac
---
 .../basic/test_basic_parameter_types.cpp      | 614 ++++++++++--------
 1 file changed, 358 insertions(+), 256 deletions(-)

diff --git a/test_conformance/basic/test_basic_parameter_types.cpp b/test_conformance/basic/test_basic_parameter_types.cpp
index 886da6a108..6e99d462ce 100644
--- a/test_conformance/basic/test_basic_parameter_types.cpp
+++ b/test_conformance/basic/test_basic_parameter_types.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -23,279 +23,381 @@
 
 #include "procs.h"
 
-const char *kernel_code =
-"__kernel void test_kernel(\n"
-"char%s c, uchar%s uc, short%s s, ushort%s us, int%s i, uint%s ui, float%s f,\n"
-"__global float%s *result)\n"
-"{\n"
-"  result[0] = %s(c);\n"
-"  result[1] = %s(uc);\n"
-"  result[2] = %s(s);\n"
-"  result[3] = %s(us);\n"
-"  result[4] = %s(i);\n"
-"  result[5] = %s(ui);\n"
-"  result[6] = f;\n"
-"}\n";
-
-const char *kernel_code_long =
-"__kernel void test_kernel_long(\n"
-"long%s l, ulong%s ul,\n"
-"__global float%s *result)\n"
-"{\n"
-"  result[0] = %s(l);\n"
-"  result[1] = %s(ul);\n"
-"}\n";
-
-int test_parameter_types_long(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+const char *kernel_code = R"(
+__kernel void test_kernel(
+char%s c, uchar%s uc, short%s s, ushort%s us, int%s i, uint%s ui, float%s f,
+__global float%s *result)
 {
-  clMemWrapper results;
-  int error;
-  size_t global[3] = {1, 1, 1};
-  float results_back[2*16];
-  int count, index;
-  const char* types[] = { "long", "ulong" };
-  char kernel_string[8192];
-  int sizes[] = {1, 2, 4, 8, 16};
-  const char* size_strings[] = {"", "2", "4", "8", "16"};
-  float expected;
-  int total_errors = 0;
-  int size_to_test;
-  char *ptr;
-  char convert_string[1024];
-  size_t max_parameter_size;
-
-  // We don't really care about the contents since we're just testing that the types work.
-  cl_long l[16]={-21,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15};
-  cl_ulong ul[16]={22,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-
-  // Calculate how large our paramter size is to the kernel
-  size_t parameter_size = sizeof(cl_long) + sizeof(cl_ulong);
-
-  // Init our strings.
-  kernel_string[0] = '\0';
-  convert_string[0] = '\0';
-
-  // Get the maximum parameter size allowed
-  error = clGetDeviceInfo( device, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( max_parameter_size ), &max_parameter_size, NULL );
-    test_error( error, "Unable to get max parameter size from device" );
-
-  // Create the results buffer
-  results = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float)*2*16, NULL, &error);
-  test_error(error, "clCreateBuffer failed");
-
-  // Go over all the vector sizes
-  for (size_to_test = 0; size_to_test < 5; size_to_test++) {
-    clProgramWrapper program;
-    clKernelWrapper kernel;
-
-    size_t total_parameter_size = parameter_size*sizes[size_to_test] + sizeof(cl_mem);
-    if (total_parameter_size > max_parameter_size) {
-      log_info("Can not test with vector size %d because it would exceed the maximum allowed parameter size to the kernel. (%d > %d)\n",
-               (int)sizes[size_to_test], (int)total_parameter_size, (int)max_parameter_size);
-      continue;
-    }
+    result[0] = %s(c);
+    result[1] = %s(uc);
+    result[2] = %s(s);
+    result[3] = %s(us);
+    result[4] = %s(i);
+    result[5] = %s(ui);
+    result[6] = f;
+})";
+
+const char *kernel_code_long = R"(
+__kernel void test_kernel_long(
+long%s l, ulong%s ul,
+__global float%s *result)
+{
+    result[0] = %s(l);
+    result[1] = %s(ul);
+})";
 
-    log_info("Testing vector size %d\n", sizes[size_to_test]);
+int test_parameter_types_long(cl_device_id device, cl_context context,
+                              cl_command_queue queue, int num_elements)
+{
+    clMemWrapper results;
+    int error;
+    size_t global[3] = { 1, 1, 1 };
+    float results_back[2 * 16];
+    int count, index;
+    const char *types[] = { "long", "ulong" };
+    char kernel_string[8192];
+    int sizes[] = { 1, 2, 4, 8, 16 };
+    const char *size_strings[] = { "", "2", "4", "8", "16" };
+    float expected;
+    int total_errors = 0;
+    int size_to_test;
+    char *ptr;
+    char convert_string[1024];
+    size_t max_parameter_size;
+
+    // We don't really care about the contents since we're just testing that the
+    // types work.
+    cl_long l[16] = { -21, -1, 2,  -3,  4,  -5,  6,  -7,
+                      8,   -9, 10, -11, 12, -13, 14, -15 };
+    cl_ulong ul[16] = { 22, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+
+    // Calculate how large our paramter size is to the kernel
+    size_t parameter_size = sizeof(cl_long) + sizeof(cl_ulong);
+
+    // Init our strings.
+    kernel_string[0] = '\0';
+    convert_string[0] = '\0';
+
+    // Get the maximum parameter size allowed
+    error =
+        clGetDeviceInfo(device, CL_DEVICE_MAX_PARAMETER_SIZE,
+                        sizeof(max_parameter_size), &max_parameter_size, NULL);
+    test_error(error, "Unable to get max parameter size from device");
+
+    // Create the results buffer
+    results = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                             sizeof(cl_float) * 2 * 16, NULL, &error);
+    test_error(error, "clCreateBuffer failed");
+
+    // Go over all the vector sizes
+    for (size_to_test = 0; size_to_test < 5; size_to_test++)
+    {
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+
+        size_t total_parameter_size =
+            parameter_size * sizes[size_to_test] + sizeof(cl_mem);
+        if (total_parameter_size > max_parameter_size)
+        {
+            log_info(
+                "Can not test with vector size %d because it would exceed the "
+                "maximum allowed parameter size to the kernel. (%d > %d)\n",
+                (int)sizes[size_to_test], (int)total_parameter_size,
+                (int)max_parameter_size);
+            continue;
+        }
 
-    // If size is > 1, then we need a explicit convert call.
-    if (sizes[size_to_test] > 1) {
-      sprintf(convert_string, "convert_float%s",  size_strings[size_to_test]);
-    } else {
-      sprintf(convert_string, " ");
-    }
+        log_info("Testing vector size %d\n", sizes[size_to_test]);
 
-    // Build the kernel
-    sprintf(kernel_string, kernel_code_long,
-            size_strings[size_to_test], size_strings[size_to_test], size_strings[size_to_test],
-            convert_string, convert_string
-    );
-
-    ptr = kernel_string;
-    error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&ptr, "test_kernel_long");
-    test_error(error, "create single kernel failed");
-
-    // Set the arguments
-    for (count = 0; count < 2; count++) {
-      switch (count) {
-        case 0: error = clSetKernelArg(kernel, count, sizeof(cl_long)*sizes[size_to_test], &l); break;
-        case 1: error = clSetKernelArg(kernel, count, sizeof(cl_ulong)*sizes[size_to_test], &ul); break;
-        default: log_error("Test error"); break;
-      }
-      if (error)
-        log_error("Setting kernel arg %d %s%s: ", count, types[count], size_strings[size_to_test]);
-      test_error(error, "clSetKernelArgs failed");
-    }
-    error = clSetKernelArg(kernel, 2, sizeof(cl_mem), &results);
-    test_error(error, "clSetKernelArgs failed");
-
-    // Execute
-    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0, NULL, NULL);
-    test_error(error, "clEnqueueNDRangeKernel failed");
-
-    // Read back the results
-    error = clEnqueueReadBuffer(queue, results, CL_TRUE, 0, sizeof(cl_float)*2*16, results_back, 0, NULL, NULL);
-    test_error(error, "clEnqueueReadBuffer failed");
-
-    // Verify the results
-    for (count = 0; count < 2; count++) {
-      for (index=0; index < sizes[size_to_test]; index++) {
-        switch (count) {
-          case 0: expected = (float)l[index]; break;
-          case 1: expected = (float)ul[index]; break;
-          default: log_error("Test error"); break;
+        // If size is > 1, then we need a explicit convert call.
+        if (sizes[size_to_test] > 1)
+        {
+            sprintf(convert_string, "convert_float%s",
+                    size_strings[size_to_test]);
+        }
+        else
+        {
+            sprintf(convert_string, " ");
         }
 
-        if (results_back[count*sizes[size_to_test]+index] != expected) {
-          total_errors++;
-          log_error("Conversion from %s%s failed: index %d got %g, expected %g.\n", types[count], size_strings[size_to_test],
-                    index, results_back[count*sizes[size_to_test]+index], expected);
+        // Build the kernel
+        sprintf(kernel_string, kernel_code_long, size_strings[size_to_test],
+                size_strings[size_to_test], size_strings[size_to_test],
+                convert_string, convert_string);
+
+        ptr = kernel_string;
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            (const char **)&ptr,
+                                            "test_kernel_long");
+        test_error(error, "create single kernel failed");
+
+        // Set the arguments
+        for (count = 0; count < 2; count++)
+        {
+            switch (count)
+            {
+                case 0:
+                    error = clSetKernelArg(
+                        kernel, count, sizeof(cl_long) * sizes[size_to_test],
+                        &l);
+                    break;
+                case 1:
+                    error = clSetKernelArg(
+                        kernel, count, sizeof(cl_ulong) * sizes[size_to_test],
+                        &ul);
+                    break;
+                default: log_error("Test error"); break;
+            }
+            if (error)
+                log_error("Setting kernel arg %d %s%s: ", count, types[count],
+                          size_strings[size_to_test]);
+            test_error(error, "clSetKernelArgs failed");
+        }
+        error = clSetKernelArg(kernel, 2, sizeof(cl_mem), &results);
+        test_error(error, "clSetKernelArgs failed");
+
+        // Execute
+        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0,
+                                       NULL, NULL);
+        test_error(error, "clEnqueueNDRangeKernel failed");
+
+        // Read back the results
+        error = clEnqueueReadBuffer(queue, results, CL_TRUE, 0,
+                                    sizeof(cl_float) * 2 * 16, results_back, 0,
+                                    NULL, NULL);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        // Verify the results
+        for (count = 0; count < 2; count++)
+        {
+            for (index = 0; index < sizes[size_to_test]; index++)
+            {
+                switch (count)
+                {
+                    case 0: expected = (float)l[index]; break;
+                    case 1: expected = (float)ul[index]; break;
+                    default: log_error("Test error"); break;
+                }
+
+                if (results_back[count * sizes[size_to_test] + index]
+                    != expected)
+                {
+                    total_errors++;
+                    log_error("Conversion from %s%s failed: index %d got %g, "
+                              "expected %g.\n",
+                              types[count], size_strings[size_to_test], index,
+                              results_back[count * sizes[size_to_test] + index],
+                              expected);
+                }
+            }
         }
-      }
     }
-  }
 
-  return total_errors;
+    return total_errors;
 }
 
-int test_parameter_types(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+int test_parameter_types(cl_device_id device, cl_context context,
+                         cl_command_queue queue, int num_elements)
 {
-  clMemWrapper results;
-  int error;
-  size_t global[3] = {1, 1, 1};
-  float results_back[7*16];
-  int count, index;
-  const char* types[] = {"char", "uchar", "short", "ushort", "int", "uint", "float"};
-  char kernel_string[8192];
-  int sizes[] = {1, 2, 4, 8, 16};
-  const char* size_strings[] = {"", "2", "4", "8", "16"};
-  float expected;
-  int total_errors = 0;
-  int size_to_test;
-  char *ptr;
-  char convert_string[1024];
-  size_t max_parameter_size;
-
-  // We don't really care about the contents since we're just testing that the types work.
-  cl_char c[16]={0,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15};
-  cl_uchar uc[16]={16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-  cl_short s[16]={-17,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15};
-  cl_ushort us[16]={18,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-  cl_int i[16]={-19,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15};
-  cl_uint ui[16]={20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-  cl_float f[16]={-23,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15};
-
-  // Calculate how large our paramter size is to the kernel
-  size_t parameter_size = sizeof(cl_char) + sizeof(cl_uchar) +
-  sizeof(cl_short) +sizeof(cl_ushort) +
-  sizeof(cl_int) +sizeof(cl_uint) +
-  sizeof(cl_float);
-
-  // Init our strings.
-  kernel_string[0] = '\0';
-  convert_string[0] = '\0';
-
-  // Get the maximum parameter size allowed
-  error = clGetDeviceInfo( device, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( max_parameter_size ), &max_parameter_size, NULL );
-    test_error( error, "Unable to get max parameter size from device" );
-
-  // Create the results buffer
-  results = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float)*7*16, NULL, &error);
-  test_error(error, "clCreateBuffer failed");
-
-  // Go over all the vector sizes
-  for (size_to_test = 0; size_to_test < 5; size_to_test++) {
-    clProgramWrapper program;
-    clKernelWrapper kernel;
-
-    size_t total_parameter_size = parameter_size*sizes[size_to_test] + sizeof(cl_mem);
-    if (total_parameter_size > max_parameter_size) {
-      log_info("Can not test with vector size %d because it would exceed the maximum allowed parameter size to the kernel. (%d > %d)\n",
-               (int)sizes[size_to_test], (int)total_parameter_size, (int)max_parameter_size);
-      continue;
-    }
-
-    log_info("Testing vector size %d\n", sizes[size_to_test]);
+    clMemWrapper results;
+    int error;
+    size_t global[3] = { 1, 1, 1 };
+    float results_back[7 * 16];
+    int count, index;
+    const char *types[] = { "char", "uchar", "short", "ushort",
+                            "int",  "uint",  "float" };
+    char kernel_string[8192];
+    int sizes[] = { 1, 2, 4, 8, 16 };
+    const char *size_strings[] = { "", "2", "4", "8", "16" };
+    float expected;
+    int total_errors = 0;
+    int size_to_test;
+    char *ptr;
+    char convert_string[1024];
+    size_t max_parameter_size;
+
+    // We don't really care about the contents since we're just testing that the
+    // types work.
+    cl_char c[16] = { 0, -1, 2,  -3,  4,  -5,  6,  -7,
+                      8, -9, 10, -11, 12, -13, 14, -15 };
+    cl_uchar uc[16] = { 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+    cl_short s[16] = { -17, -1, 2,  -3,  4,  -5,  6,  -7,
+                       8,   -9, 10, -11, 12, -13, 14, -15 };
+    cl_ushort us[16] = {
+        18, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    };
+    cl_int i[16] = { -19, -1, 2,  -3,  4,  -5,  6,  -7,
+                     8,   -9, 10, -11, 12, -13, 14, -15 };
+    cl_uint ui[16] = { 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+    cl_float f[16] = { -23, -1, 2,  -3,  4,  -5,  6,  -7,
+                       8,   -9, 10, -11, 12, -13, 14, -15 };
+
+    // Calculate how large our paramter size is to the kernel
+    size_t parameter_size = sizeof(cl_char) + sizeof(cl_uchar)
+        + sizeof(cl_short) + sizeof(cl_ushort) + sizeof(cl_int)
+        + sizeof(cl_uint) + sizeof(cl_float);
+
+    // Init our strings.
+    kernel_string[0] = '\0';
+    convert_string[0] = '\0';
+
+    // Get the maximum parameter size allowed
+    error =
+        clGetDeviceInfo(device, CL_DEVICE_MAX_PARAMETER_SIZE,
+                        sizeof(max_parameter_size), &max_parameter_size, NULL);
+    test_error(error, "Unable to get max parameter size from device");
+
+    // Create the results buffer
+    results = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                             sizeof(cl_float) * 7 * 16, NULL, &error);
+    test_error(error, "clCreateBuffer failed");
+
+    // Go over all the vector sizes
+    for (size_to_test = 0; size_to_test < 5; size_to_test++)
+    {
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+
+        size_t total_parameter_size =
+            parameter_size * sizes[size_to_test] + sizeof(cl_mem);
+        if (total_parameter_size > max_parameter_size)
+        {
+            log_info(
+                "Can not test with vector size %d because it would exceed the "
+                "maximum allowed parameter size to the kernel. (%d > %d)\n",
+                (int)sizes[size_to_test], (int)total_parameter_size,
+                (int)max_parameter_size);
+            continue;
+        }
 
-    // If size is > 1, then we need a explicit convert call.
-    if (sizes[size_to_test] > 1) {
-      sprintf(convert_string, "convert_float%s",  size_strings[size_to_test]);
-    } else {
-      sprintf(convert_string, " ");
-    }
+        log_info("Testing vector size %d\n", sizes[size_to_test]);
 
-    // Build the kernel
-    sprintf(kernel_string, kernel_code,
-            size_strings[size_to_test], size_strings[size_to_test], size_strings[size_to_test],
-            size_strings[size_to_test], size_strings[size_to_test], size_strings[size_to_test],
-            size_strings[size_to_test], size_strings[size_to_test],
-            convert_string, convert_string, convert_string,
-            convert_string, convert_string, convert_string
-    );
-
-    ptr = kernel_string;
-    error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&ptr, "test_kernel");
-    test_error(error, "create single kernel failed");
-
-    // Set the arguments
-    for (count = 0; count < 7; count++) {
-      switch (count) {
-        case 0: error = clSetKernelArg(kernel, count, sizeof(cl_char)*sizes[size_to_test], &c); break;
-        case 1: error = clSetKernelArg(kernel, count, sizeof(cl_uchar)*sizes[size_to_test], &uc); break;
-        case 2: error = clSetKernelArg(kernel, count, sizeof(cl_short)*sizes[size_to_test], &s); break;
-        case 3: error = clSetKernelArg(kernel, count, sizeof(cl_ushort)*sizes[size_to_test], &us); break;
-        case 4: error = clSetKernelArg(kernel, count, sizeof(cl_int)*sizes[size_to_test], &i); break;
-        case 5: error = clSetKernelArg(kernel, count, sizeof(cl_uint)*sizes[size_to_test], &ui); break;
-        case 6: error = clSetKernelArg(kernel, count, sizeof(cl_float)*sizes[size_to_test], &f); break;
-        default: log_error("Test error"); break;
-      }
-      if (error)
-        log_error("Setting kernel arg %d %s%s: ", count, types[count], size_strings[size_to_test]);
-      test_error(error, "clSetKernelArgs failed");
-    }
-    error = clSetKernelArg(kernel, 7, sizeof(cl_mem), &results);
-    test_error(error, "clSetKernelArgs failed");
-
-    // Execute
-    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0, NULL, NULL);
-    test_error(error, "clEnqueueNDRangeKernel failed");
-
-    // Read back the results
-    error = clEnqueueReadBuffer(queue, results, CL_TRUE, 0, sizeof(cl_float)*7*16, results_back, 0, NULL, NULL);
-    test_error(error, "clEnqueueReadBuffer failed");
-
-    // Verify the results
-    for (count = 0; count < 7; count++) {
-      for (index=0; index < sizes[size_to_test]; index++) {
-        switch (count) {
-          case 0: expected = (float)c[index]; break;
-          case 1: expected = (float)uc[index]; break;
-          case 2: expected = (float)s[index]; break;
-          case 3: expected = (float)us[index]; break;
-          case 4: expected = (float)i[index]; break;
-          case 5: expected = (float)ui[index]; break;
-          case 6: expected = (float)f[index]; break;
-          default: log_error("Test error"); break;
+        // If size is > 1, then we need a explicit convert call.
+        if (sizes[size_to_test] > 1)
+        {
+            sprintf(convert_string, "convert_float%s",
+                    size_strings[size_to_test]);
+        }
+        else
+        {
+            sprintf(convert_string, " ");
         }
 
-        if (results_back[count*sizes[size_to_test]+index] != expected) {
-          total_errors++;
-          log_error("Conversion from %s%s failed: index %d got %g, expected %g.\n", types[count], size_strings[size_to_test],
-                    index, results_back[count*sizes[size_to_test]+index], expected);
+        // Build the kernel
+        sprintf(kernel_string, kernel_code, size_strings[size_to_test],
+                size_strings[size_to_test], size_strings[size_to_test],
+                size_strings[size_to_test], size_strings[size_to_test],
+                size_strings[size_to_test], size_strings[size_to_test],
+                size_strings[size_to_test], convert_string, convert_string,
+                convert_string, convert_string, convert_string, convert_string);
+
+        ptr = kernel_string;
+        error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                            (const char **)&ptr, "test_kernel");
+        test_error(error, "create single kernel failed");
+
+        // Set the arguments
+        for (count = 0; count < 7; count++)
+        {
+            switch (count)
+            {
+                case 0:
+                    error = clSetKernelArg(
+                        kernel, count, sizeof(cl_char) * sizes[size_to_test],
+                        &c);
+                    break;
+                case 1:
+                    error = clSetKernelArg(
+                        kernel, count, sizeof(cl_uchar) * sizes[size_to_test],
+                        &uc);
+                    break;
+                case 2:
+                    error = clSetKernelArg(
+                        kernel, count, sizeof(cl_short) * sizes[size_to_test],
+                        &s);
+                    break;
+                case 3:
+                    error = clSetKernelArg(
+                        kernel, count, sizeof(cl_ushort) * sizes[size_to_test],
+                        &us);
+                    break;
+                case 4:
+                    error = clSetKernelArg(kernel, count,
+                                           sizeof(cl_int) * sizes[size_to_test],
+                                           &i);
+                    break;
+                case 5:
+                    error = clSetKernelArg(
+                        kernel, count, sizeof(cl_uint) * sizes[size_to_test],
+                        &ui);
+                    break;
+                case 6:
+                    error = clSetKernelArg(
+                        kernel, count, sizeof(cl_float) * sizes[size_to_test],
+                        &f);
+                    break;
+                default: log_error("Test error"); break;
+            }
+            if (error)
+                log_error("Setting kernel arg %d %s%s: ", count, types[count],
+                          size_strings[size_to_test]);
+            test_error(error, "clSetKernelArgs failed");
+        }
+        error = clSetKernelArg(kernel, 7, sizeof(cl_mem), &results);
+        test_error(error, "clSetKernelArgs failed");
+
+        // Execute
+        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0,
+                                       NULL, NULL);
+        test_error(error, "clEnqueueNDRangeKernel failed");
+
+        // Read back the results
+        error = clEnqueueReadBuffer(queue, results, CL_TRUE, 0,
+                                    sizeof(cl_float) * 7 * 16, results_back, 0,
+                                    NULL, NULL);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        // Verify the results
+        for (count = 0; count < 7; count++)
+        {
+            for (index = 0; index < sizes[size_to_test]; index++)
+            {
+                switch (count)
+                {
+                    case 0: expected = (float)c[index]; break;
+                    case 1: expected = (float)uc[index]; break;
+                    case 2: expected = (float)s[index]; break;
+                    case 3: expected = (float)us[index]; break;
+                    case 4: expected = (float)i[index]; break;
+                    case 5: expected = (float)ui[index]; break;
+                    case 6: expected = (float)f[index]; break;
+                    default: log_error("Test error"); break;
+                }
+
+                if (results_back[count * sizes[size_to_test] + index]
+                    != expected)
+                {
+                    total_errors++;
+                    log_error("Conversion from %s%s failed: index %d got %g, "
+                              "expected %g.\n",
+                              types[count], size_strings[size_to_test], index,
+                              results_back[count * sizes[size_to_test] + index],
+                              expected);
+                }
+            }
         }
-      }
     }
-  }
 
-  if (gHasLong) {
-    log_info("Testing long types...\n");
-    total_errors += test_parameter_types_long( device, context, queue, num_elements );
-  }
-  else {
-    log_info("Longs unsupported, skipping.");
-  }
+    if (gHasLong)
+    {
+        log_info("Testing long types...\n");
+        total_errors +=
+            test_parameter_types_long(device, context, queue, num_elements);
+    }
+    else
+    {
+        log_info("Longs unsupported, skipping.");
+    }
 
-  return total_errors;
+    return total_errors;
 }
-
-
-

From 5d7be40e68039bb7c43769e32b14b62862ad475c Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Thu, 11 Feb 2021 11:37:38 +0000
Subject: [PATCH 040/158] Remove undesired conversions from size_t to int
 (#1153)

Improve math_brute_force kernels by consistently using size_t to store
the result of get_global_id().

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/binary.cpp               | 4 ++--
 test_conformance/math_brute_force/binary_i.cpp             | 4 ++--
 test_conformance/math_brute_force/binary_two_results_i.cpp | 4 ++--
 test_conformance/math_brute_force/i_unary.cpp              | 2 +-
 test_conformance/math_brute_force/macro_binary.cpp         | 4 ++--
 test_conformance/math_brute_force/macro_unary.cpp          | 4 ++--
 test_conformance/math_brute_force/mad.cpp                  | 4 ++--
 test_conformance/math_brute_force/ternary.cpp              | 4 ++--
 test_conformance/math_brute_force/unary.cpp                | 4 ++--
 test_conformance/math_brute_force/unary_two_results.cpp    | 4 ++--
 test_conformance/math_brute_force/unary_two_results_i.cpp  | 4 ++--
 test_conformance/math_brute_force/unary_u.cpp              | 4 ++--
 12 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary.cpp
index e6b9cbbc35..c428c5e455 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary.cpp
@@ -49,7 +49,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                         sizeNames[vectorSize],
                         "* in2 )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in1[i], in2[i] );\n"
@@ -136,7 +136,7 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                         sizeNames[vectorSize],
                         "* in2 )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in1[i], in2[i] );\n"
diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i.cpp
index f931c5be49..99504c745b 100644
--- a/test_conformance/math_brute_force/binary_i.cpp
+++ b/test_conformance/math_brute_force/binary_i.cpp
@@ -39,7 +39,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                         sizeNames[vectorSize],
                         "* in2 )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in1[i], in2[i] );\n"
@@ -126,7 +126,7 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                         sizeNames[vectorSize],
                         "* in2 )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in1[i], in2[i] );\n"
diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
index 2ecf1c287f..4686f7e91e 100644
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i.cpp
@@ -42,7 +42,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in2 )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in1[i], in2[i], out2 + i );\n"
@@ -135,7 +135,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in2 )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in1[i], in2[i], out2[i] );\n"
diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary.cpp
index b736b24f1f..cd5e2fa845 100644
--- a/test_conformance/math_brute_force/i_unary.cpp
+++ b/test_conformance/math_brute_force/i_unary.cpp
@@ -116,7 +116,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i] );\n"
diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary.cpp
index 0c37068e16..e6b4c17a33 100644
--- a/test_conformance/math_brute_force/macro_binary.cpp
+++ b/test_conformance/math_brute_force/macro_binary.cpp
@@ -38,7 +38,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                         sizeNames[vectorSize],
                         "* in2 )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in1[i], in2[i] );\n"
@@ -125,7 +125,7 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                         sizeNames[vectorSize],
                         "* in2 )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in1[i], in2[i] );\n"
diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary.cpp
index ced72be816..b2bcd8f5d5 100644
--- a/test_conformance/math_brute_force/macro_unary.cpp
+++ b/test_conformance/math_brute_force/macro_unary.cpp
@@ -36,7 +36,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                         sizeNames[vectorSize],
                         "* in )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i] );\n"
@@ -118,7 +118,7 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                         sizeNames[vectorSize],
                         "* in )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i] );\n"
diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad.cpp
index 872caa0bee..845f7ed4df 100644
--- a/test_conformance/math_brute_force/mad.cpp
+++ b/test_conformance/math_brute_force/mad.cpp
@@ -39,7 +39,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in3 )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in1[i], in2[i], in3[i] );\n"
@@ -131,7 +131,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in3 )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in1[i], in2[i], in3[i] );\n"
diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary.cpp
index 1b03b209a0..f05c605aca 100644
--- a/test_conformance/math_brute_force/ternary.cpp
+++ b/test_conformance/math_brute_force/ternary.cpp
@@ -43,7 +43,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in3 )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in1[i], in2[i], in3[i] );\n"
@@ -135,7 +135,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in3 )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in1[i], in2[i], in3[i] );\n"
diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary.cpp
index 5085b9b4d7..54b409f56d 100644
--- a/test_conformance/math_brute_force/unary.cpp
+++ b/test_conformance/math_brute_force/unary.cpp
@@ -39,7 +39,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                         sizeNames[vectorSize],
                         "* in )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i] );\n"
@@ -120,7 +120,7 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                         sizeNames[vectorSize],
                         "* in )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i] );\n"
diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results.cpp
index a3be1d8d5e..28fbf19428 100644
--- a/test_conformance/math_brute_force/unary_two_results.cpp
+++ b/test_conformance/math_brute_force/unary_two_results.cpp
@@ -38,7 +38,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i], out2 + i );\n"
@@ -124,7 +124,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i], out2 + i );\n"
diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i.cpp
index 6c56ed1f94..2eb2ef3a1c 100644
--- a/test_conformance/math_brute_force/unary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i.cpp
@@ -39,7 +39,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i], out2 + i );\n"
@@ -125,7 +125,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i], out2 + i );\n"
diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u.cpp
index df6724cafe..26ceb683ce 100644
--- a/test_conformance/math_brute_force/unary_u.cpp
+++ b/test_conformance/math_brute_force/unary_u.cpp
@@ -35,7 +35,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i] );\n"
@@ -115,7 +115,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in )\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i] );\n"

From b398f8c1b2ddf6f8e19eaeb564c1f78bde8a08db Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Thu, 11 Feb 2021 11:38:01 +0000
Subject: [PATCH 041/158] Fix typo in error handling (#1152)

When creating sub-buffers in math_brute_force, the wrong variables were
checked for runtime errors. This patch fixes this and ensures the logs
are consistent.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/binary.cpp             | 5 ++---
 test_conformance/math_brute_force/binaryOperator.cpp     | 8 ++++----
 test_conformance/math_brute_force/binary_i.cpp           | 9 +++++----
 .../math_brute_force/binary_two_results_i.cpp            | 4 ++--
 test_conformance/math_brute_force/macro_binary.cpp       | 8 ++++----
 5 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary.cpp
index c428c5e455..3caa89d236 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary.cpp
@@ -1342,7 +1342,6 @@ static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
 
-
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -1412,9 +1411,9 @@ static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
         test_info.tinfo[i].inBuf2 =
             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
                        "region {%zd, %zd}\n",
                        region.origin, region.size);
             goto exit;
diff --git a/test_conformance/math_brute_force/binaryOperator.cpp b/test_conformance/math_brute_force/binaryOperator.cpp
index 0957c6af62..8763e688cc 100644
--- a/test_conformance/math_brute_force/binaryOperator.cpp
+++ b/test_conformance/math_brute_force/binaryOperator.cpp
@@ -471,9 +471,9 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
         test_info.tinfo[i].inBuf2 =
             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
                        "region {%zd, %zd}\n",
                        region.origin, region.size);
             goto exit;
@@ -1349,9 +1349,9 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
         test_info.tinfo[i].inBuf2 =
             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
                        "region {%zd, %zd}\n",
                        region.origin, region.size);
             goto exit;
diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i.cpp
index 99504c745b..80ddbed411 100644
--- a/test_conformance/math_brute_force/binary_i.cpp
+++ b/test_conformance/math_brute_force/binary_i.cpp
@@ -468,9 +468,9 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
         test_info.tinfo[i].inBuf2 =
             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
                               CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
                        "region {%zd, %zd}\n",
                        region.origin, region.size);
             goto exit;
@@ -1089,6 +1089,7 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -1158,9 +1159,9 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
         test_info.tinfo[i].inBuf2 =
             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
                               CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
                        "region {%zd, %zd}\n",
                        region.origin, region.size);
             goto exit;
diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
index 4686f7e91e..ee87bc84d6 100644
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i.cpp
@@ -713,7 +713,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
@@ -1216,7 +1216,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
                                           bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary.cpp
index e6b4c17a33..aa83a58856 100644
--- a/test_conformance/math_brute_force/macro_binary.cpp
+++ b/test_conformance/math_brute_force/macro_binary.cpp
@@ -444,9 +444,9 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         test_info.tinfo[i].inBuf2 =
             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
                        "region {%zd, %zd}\n",
                        region.origin, region.size);
             goto exit;
@@ -1131,9 +1131,9 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         test_info.tinfo[i].inBuf2 =
             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
                        "region {%zd, %zd}\n",
                        region.origin, region.size);
             goto exit;

From 901f4db44960703a04f760fc939d5f79bd43b626 Mon Sep 17 00:00:00 2001
From: doe300 <doe300@users.noreply.github.com>
Date: Wed, 17 Feb 2021 09:57:02 +0100
Subject: [PATCH 042/158] Fix compilation error in rtz mode detection (#1163)

---
 test_conformance/math_brute_force/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 1b63c8288a..82248da20f 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -1658,7 +1658,7 @@ static int IsInRTZMode(void)
             volatile float a = 0x1.0p23f;
             volatile float b = -0x1.0p23f;
             out[0] = (a + 0x1.fffffep-1f == a) && (b - 0x1.fffffep-1f == b);
-        "})";
+        })";
 
     clProgramWrapper query;
     clKernelWrapper kernel;

From 5d2a4bc522993db364bd3f9c9dbbacf1cbee8ea2 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Wed, 17 Feb 2021 09:17:34 +0000
Subject: [PATCH 043/158] Reduce difference between files (#1155)

Moves `if (!gSkipCorrectnessTesting)` in binary_i.cpp to follow the
structure of binary.cpp, unary.cpp, macro_unary.cpp and
macro_binary.cpp.

Other non-semantic changes include adding/removing blank lines and
updating comments.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/binary.cpp  |  1 +
 .../math_brute_force/binaryOperator.cpp       |  5 +-
 .../math_brute_force/binary_i.cpp             | 49 +++++++++----------
 .../math_brute_force/macro_binary.cpp         |  5 +-
 .../math_brute_force/macro_unary.cpp          |  3 +-
 test_conformance/math_brute_force/mad.cpp     |  4 +-
 test_conformance/math_brute_force/unary.cpp   |  2 +-
 7 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary.cpp
index 3caa89d236..c6c49359f5 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary.cpp
@@ -634,6 +634,7 @@ static int TestFunc_Float_Float_Float_common(const Func *f, MTdata d,
     vlog("\n");
 
 exit:
+    // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
diff --git a/test_conformance/math_brute_force/binaryOperator.cpp b/test_conformance/math_brute_force/binaryOperator.cpp
index 8763e688cc..c99cf87027 100644
--- a/test_conformance/math_brute_force/binaryOperator.cpp
+++ b/test_conformance/math_brute_force/binaryOperator.cpp
@@ -567,7 +567,6 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
             return error;
         }
 
-
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
@@ -633,6 +632,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
     vlog("\n");
 
 exit:
+    // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
@@ -1147,7 +1147,6 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     return error;
 }
 
-
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
     -NAN,
@@ -1437,6 +1436,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
@@ -1444,7 +1444,6 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
             return error;
         }
 
-
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i.cpp
index 80ddbed411..db2bf8e19e 100644
--- a/test_conformance/math_brute_force/binary_i.cpp
+++ b/test_conformance/math_brute_force/binary_i.cpp
@@ -513,24 +513,23 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
     }
 
     // Run the kernels
-    error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
-
-
-    // Accumulate the arithmetic errors
-    for (i = 0; i < test_info.threadCount; i++)
+    if (!gSkipCorrectnessTesting)
     {
-        if (test_info.tinfo[i].maxError > maxError)
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            maxError = test_info.tinfo[i].maxError;
-            maxErrorVal = test_info.tinfo[i].maxErrorValue;
-            maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
         }
-    }
 
-    if (error) goto exit;
+        if (error) goto exit;
 
-    if (!gSkipCorrectnessTesting)
-    {
         if (gWimpyMode)
             vlog("Wimp pass");
         else
@@ -562,7 +561,6 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
             return error;
         }
 
-
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
@@ -628,6 +626,7 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
     vlog("\n");
 
 exit:
+    // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
@@ -1205,24 +1204,22 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
 
     // Run the kernels
     if (!gSkipCorrectnessTesting)
+    {
         error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
 
-
-    // Accumulate the arithmetic errors
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        if (test_info.tinfo[i].maxError > maxError)
+        // Accumulate the arithmetic errors
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            maxError = test_info.tinfo[i].maxError;
-            maxErrorVal = test_info.tinfo[i].maxErrorValue;
-            maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
         }
-    }
 
-    if (error) goto exit;
+        if (error) goto exit;
 
-    if (!gSkipCorrectnessTesting)
-    {
         if (gWimpyMode)
             vlog("Wimp pass");
         else
diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary.cpp
index aa83a58856..46ba413e4f 100644
--- a/test_conformance/math_brute_force/macro_binary.cpp
+++ b/test_conformance/math_brute_force/macro_binary.cpp
@@ -526,7 +526,6 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             return error;
         }
 
-
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
@@ -590,6 +589,7 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     vlog("\n");
 
 exit:
+    // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
@@ -934,7 +934,6 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     return error;
 }
 
-
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
     -NAN,
@@ -1207,6 +1206,7 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
@@ -1214,7 +1214,6 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             return error;
         }
 
-
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary.cpp
index b2bcd8f5d5..1c8275c06c 100644
--- a/test_conformance/math_brute_force/macro_unary.cpp
+++ b/test_conformance/math_brute_force/macro_unary.cpp
@@ -441,6 +441,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     vlog("\n");
 
 exit:
+    // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
@@ -507,7 +508,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     // Get that moving
     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    // Write the new values to the input array
+    // Init input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     for (j = 0; j < buffer_elements; j++) p[j] = base + j * scale;
 
diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad.cpp
index 845f7ed4df..f1b13bd3b4 100644
--- a/test_conformance/math_brute_force/mad.cpp
+++ b/test_conformance/math_brute_force/mad.cpp
@@ -676,18 +676,21 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
             p2[j] = DoubleFromUInt32(genrand_int32(d));
             p3[j] = DoubleFromUInt32(genrand_int32(d));
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           bufferSize, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
                                           bufferSize, gIn3, 0, NULL, NULL)))
         {
@@ -695,7 +698,6 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
             return error;
         }
 
-
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary.cpp
index 54b409f56d..0a00772df9 100644
--- a/test_conformance/math_brute_force/unary.cpp
+++ b/test_conformance/math_brute_force/unary.cpp
@@ -629,7 +629,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
             return error;
         }
 
-        // Run the kernel
+        // run the kernel
         size_t vectorCount =
             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its

From c67aa0535b24bea391a6a13bfa77ab20c4a86be5 Mon Sep 17 00:00:00 2001
From: John Kesapides <46718829+JohnKesapidesARM@users.noreply.github.com>
Date: Wed, 17 Feb 2021 09:19:06 +0000
Subject: [PATCH 044/158] Fix Minor memory leaks in test_buffer/compiler.
 (#1160)

* Fix Minor memory leaks in test_buffer/compiler.

Signed-off-by: John Kesapides <john.kesapides@arm.com>

* Fixes in test_buffer
* Remove commented unmap call
* remove unused ii variables.

Signed-off-by: John Kesapides <john.kesapides@arm.com>

* test_buffer fixes:
* test_buffer_fill: remove unused ii

Signed-off-by: John Kesapides <john.kesapides@arm.com>
---
 test_conformance/buffers/test_buffer_fill.cpp |  50 ++++----
 .../buffers/test_buffer_write.cpp             | 107 ++++++++++++------
 .../test_compiler_defines_for_extensions.cpp  |  10 +-
 3 files changed, 104 insertions(+), 63 deletions(-)

diff --git a/test_conformance/buffers/test_buffer_fill.cpp b/test_conformance/buffers/test_buffer_fill.cpp
index 8e0b93fa97..3edfafbc38 100644
--- a/test_conformance/buffers/test_buffer_fill.cpp
+++ b/test_conformance/buffers/test_buffer_fill.cpp
@@ -562,14 +562,13 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
                      int loops, void *inptr[5], void *hostptr[5], void *pattern[5], size_t offset_elements, size_t fill_elements,
                      const char *kernelCode[], const char *kernelName[], int (*fn)(void *,void *,int) )
 {
-    clMemWrapper buffers[10];
     void        *outptr[5];
     clProgramWrapper program[5];
     clKernelWrapper kernel[5];
     size_t      ptrSizes[5];
     size_t      global_work_size[3];
     int         err;
-    int         i, ii;
+    int i;
     int         src_flag_id;
     int         total_errors = 0;
 
@@ -586,8 +585,6 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
     loops = (loops < 5 ? loops : 5);
     for (i = 0; i < loops; i++)
     {
-        ii = i << 1;
-
         err = create_single_kernel_helper(context, &program[i], &kernel[i], 1,
                                           &kernelCode[i], kernelName[i]);
         if (err)
@@ -599,18 +596,25 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
         for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
         {
             clEventWrapper event[2];
-
+            clMemWrapper buffers[2];
             if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
-                buffers[ii] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, hostptr[i], &err);
+                buffers[0] = clCreateBuffer(context, flag_set[src_flag_id],
+                                            ptrSizes[i] * num_elements,
+                                            hostptr[i], &err);
             else
-                buffers[ii] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, NULL, &err);
-            if ( !buffers[ii] || err){
+                buffers[0] =
+                    clCreateBuffer(context, flag_set[src_flag_id],
+                                   ptrSizes[i] * num_elements, NULL, &err);
+            if (!buffers[0] || err)
+            {
                 print_error(err, "clCreateBuffer failed\n" );
                 return -1;
             }
             // Initialize source buffer with 0, since the validation code expects 0(s) outside of the fill region.
             if (!((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))) {
-                err = clEnqueueWriteBuffer(queue, buffers[ii], CL_FALSE, 0, ptrSizes[i]*num_elements, hostptr[i], 0, NULL, NULL);
+                err = clEnqueueWriteBuffer(queue, buffers[0], CL_FALSE, 0,
+                                           ptrSizes[i] * num_elements,
+                                           hostptr[i], 0, NULL, NULL);
                 if ( err != CL_SUCCESS ){
                     print_error(err, "clEnqueueWriteBuffer failed\n" );
                     return -1;
@@ -619,27 +623,31 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
 
             outptr[i] = align_malloc( ptrSizes[i] * num_elements, min_alignment);
             memset(outptr[i], 0, ptrSizes[i] * num_elements);
-            buffers[ii+1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,  ptrSizes[i] * num_elements, outptr[i], &err);
-            if ( !buffers[ii+1] || err){
+            buffers[1] =
+                clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                               ptrSizes[i] * num_elements, outptr[i], &err);
+            if (!buffers[1] || err)
+            {
                 print_error(err, "clCreateBuffer failed\n" );
                 align_free( outptr[i] );
                 return -1;
             }
 
-            err = clEnqueueFillBuffer(queue, buffers[ii], pattern[i], ptrSizes[i],
-                                      ptrSizes[i] * offset_elements, ptrSizes[i] * fill_elements,
-                                      0, NULL, &(event[0]));
-            /* uncomment for test debugging
-             err = clEnqueueWriteBuffer(queue, buffers[ii], CL_FALSE, 0, ptrSizes[i]*num_elements, inptr[i], 0, NULL, &(event[0]));
-             */
+            err = clEnqueueFillBuffer(
+                queue, buffers[0], pattern[i], ptrSizes[i],
+                ptrSizes[i] * offset_elements, ptrSizes[i] * fill_elements, 0,
+                NULL, &(event[0]));
+
             if ( err != CL_SUCCESS ){
                 print_error( err, " clEnqueueFillBuffer failed" );
                 align_free( outptr[i] );
                 return -1;
             }
 
-            err = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), (void *)&buffers[ii] );
-            err |= clSetKernelArg( kernel[i], 1, sizeof( cl_mem ), (void *)&buffers[ii+1] );
+            err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem),
+                                 (void *)&buffers[0]);
+            err |= clSetKernelArg(kernel[i], 1, sizeof(cl_mem),
+                                  (void *)&buffers[1]);
             if ( err != CL_SUCCESS ){
                 print_error( err, "clSetKernelArg failed" );
                 align_free( outptr[i] );
@@ -659,7 +667,9 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
                 return -1;
             }
 
-            err = clEnqueueReadBuffer( queue, buffers[ii+1], false, 0, ptrSizes[i]*num_elements, outptr[i], 0, NULL, &(event[1]) );
+            err = clEnqueueReadBuffer(queue, buffers[1], false, 0,
+                                      ptrSizes[i] * num_elements, outptr[i], 0,
+                                      NULL, &(event[1]));
             if (err != CL_SUCCESS){
                 print_error( err, "clEnqueueReadBuffer failed" );
                 return -1;
diff --git a/test_conformance/buffers/test_buffer_write.cpp b/test_conformance/buffers/test_buffer_write.cpp
index 2497dd1765..e57e1c18fe 100644
--- a/test_conformance/buffers/test_buffer_write.cpp
+++ b/test_conformance/buffers/test_buffer_write.cpp
@@ -624,14 +624,13 @@ static int verify_write_struct( void *ptr1, void *ptr2, int n )
 int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, size_t size, char *type, int loops,
                        void *inptr[5], const char *kernelCode[], const char *kernelName[], int (*fn)(void *,void *,int), MTdata d )
 {
-    clMemWrapper buffers[10];
     void        *outptr[5];
     clProgramWrapper program[5];
     clKernelWrapper kernel[5];
     size_t      ptrSizes[5];
     size_t      global_work_size[3];
     cl_int      err;
-    int         i, ii;
+    int i;
     int         src_flag_id, dst_flag_id;
     int         total_errors = 0;
 
@@ -660,13 +659,19 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
         {
             for (dst_flag_id = 0; dst_flag_id < NUM_FLAGS; dst_flag_id++)
             {
-                ii = i << 1;
+                clMemWrapper buffers[2];
+
                 if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
-                    buffers[ii] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, inptr[i], &err);
+                    buffers[0] = clCreateBuffer(context, flag_set[src_flag_id],
+                                                ptrSizes[i] * num_elements,
+                                                inptr[i], &err);
                 else
-                    buffers[ii] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, NULL, &err);
+                    buffers[0] =
+                        clCreateBuffer(context, flag_set[src_flag_id],
+                                       ptrSizes[i] * num_elements, NULL, &err);
 
-                if ( ! buffers[ii] || err){
+                if (!buffers[0] || err)
+                {
                     align_free( outptr[i] );
                     print_error(err, " clCreateBuffer failed\n" );
                     return -1;
@@ -674,19 +679,26 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
                 if ( ! strcmp( type, "half" ) ){
                     outptr[i] = align_malloc( ptrSizes[i] * (num_elements * 2 ), min_alignment);
                     if ((flag_set[dst_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[dst_flag_id] & CL_MEM_COPY_HOST_PTR))
-                        buffers[ii+1] = clCreateBuffer(context, flag_set[dst_flag_id],  ptrSizes[i] * 2 * num_elements, outptr[i], &err);
+                        buffers[1] = clCreateBuffer(
+                            context, flag_set[dst_flag_id],
+                            ptrSizes[i] * 2 * num_elements, outptr[i], &err);
                     else
-                        buffers[ii+1] = clCreateBuffer(context, flag_set[dst_flag_id],  ptrSizes[i] * 2 * num_elements, NULL, &err);
+                        buffers[1] = clCreateBuffer(
+                            context, flag_set[dst_flag_id],
+                            ptrSizes[i] * 2 * num_elements, NULL, &err);
                 }
                 else{
                     outptr[i] = align_malloc( ptrSizes[i] * num_elements, min_alignment);
                     if ((flag_set[dst_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[dst_flag_id] & CL_MEM_COPY_HOST_PTR))
-                        buffers[ii+1] = clCreateBuffer(context, flag_set[dst_flag_id],  ptrSizes[i] * num_elements, outptr[i], &err);
+                        buffers[1] = clCreateBuffer(
+                            context, flag_set[dst_flag_id],
+                            ptrSizes[i] * num_elements, outptr[i], &err);
                     else
-                        buffers[ii+1] = clCreateBuffer(context, flag_set[dst_flag_id],  ptrSizes[i] * num_elements, NULL, &err);
+                        buffers[1] = clCreateBuffer(
+                            context, flag_set[dst_flag_id],
+                            ptrSizes[i] * num_elements, NULL, &err);
                 }
                 if ( err ){
-                    clReleaseMemObject(buffers[ii]);
                     align_free( outptr[i] );
                     print_error(err, " clCreateBuffer failed\n" );
                     return -1;
@@ -694,7 +706,9 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
 
                 if (gTestMap) {
                     void *dataPtr;
-                    dataPtr = clEnqueueMapBuffer(queue, buffers[ii], CL_TRUE, CL_MAP_WRITE, 0, ptrSizes[i]*num_elements, 0, NULL, NULL, &err);
+                    dataPtr = clEnqueueMapBuffer(
+                        queue, buffers[0], CL_TRUE, CL_MAP_WRITE, 0,
+                        ptrSizes[i] * num_elements, 0, NULL, NULL, &err);
                     if (err) {
                         print_error(err, "clEnqueueMapBuffer failed");
                         align_free( outptr[i] );
@@ -703,7 +717,8 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
 
                     memcpy(dataPtr, inptr[i], ptrSizes[i]*num_elements);
 
-                    err = clEnqueueUnmapMemObject(queue, buffers[ii], dataPtr, 0, NULL, NULL);
+                    err = clEnqueueUnmapMemObject(queue, buffers[0], dataPtr, 0,
+                                                  NULL, NULL);
                     if (err) {
                         print_error(err, "clEnqueueUnmapMemObject failed");
                         align_free( outptr[i] );
@@ -711,7 +726,9 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
                     }
                 }
                 else if (!(flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) && !(flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR)) {
-                    err = clEnqueueWriteBuffer(queue, buffers[ii], CL_TRUE, 0, ptrSizes[i]*num_elements, inptr[i], 0, NULL, NULL);
+                    err = clEnqueueWriteBuffer(queue, buffers[0], CL_TRUE, 0,
+                                               ptrSizes[i] * num_elements,
+                                               inptr[i], 0, NULL, NULL);
                     if ( err != CL_SUCCESS ){
                         align_free( outptr[i] );
                         print_error( err, " clWriteBuffer failed" );
@@ -719,8 +736,10 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
                     }
                 }
 
-                err = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), (void *)&buffers[ii] );
-                err |= clSetKernelArg( kernel[i], 1, sizeof( cl_mem ), (void *)&buffers[ii+1] );
+                err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem),
+                                     (void *)&buffers[0]);
+                err |= clSetKernelArg(kernel[i], 1, sizeof(cl_mem),
+                                      (void *)&buffers[1]);
                 if ( err != CL_SUCCESS ){
                     align_free( outptr[i] );
                     print_error( err, " clSetKernelArg failed" );
@@ -734,12 +753,10 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
                     return -1;
                 }
 
-                if ( ! strcmp( type, "half" ) ){
-                    err = clEnqueueReadBuffer( queue, buffers[ii+1], true, 0, ptrSizes[i]*num_elements, outptr[i], 0, NULL, NULL );
-                }
-                else{
-                    err = clEnqueueReadBuffer( queue, buffers[ii+1], true, 0, ptrSizes[i]*num_elements, outptr[i], 0, NULL, NULL );
-                }
+                err = clEnqueueReadBuffer(queue, buffers[1], true, 0,
+                                          ptrSizes[i] * num_elements, outptr[i],
+                                          0, NULL, NULL);
+
                 if ( err != CL_SUCCESS ){
                     align_free( outptr[i] );
                     print_error( err, " clEnqueueReadBuffer failed" );
@@ -774,7 +791,7 @@ int test_buffer_write( cl_device_id deviceID, cl_context context, cl_command_que
 
 int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
 {
-    clMemWrapper buffers[10];
+
     void        *outptr[5];
     TestStruct  *inptr[5];
     clProgramWrapper program[5];
@@ -783,7 +800,7 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
     size_t      size = sizeof( TestStruct );
     size_t      global_work_size[3];
     cl_int      err;
-    int         i, ii;
+    int i;
     cl_uint     j;
     int         loops = 1;      // no vector for structs
     int         src_flag_id, dst_flag_id;
@@ -818,6 +835,7 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
         {
             for (dst_flag_id = 0; dst_flag_id < NUM_FLAGS; dst_flag_id++)
             {
+                clMemWrapper buffers[2];
 
                 inptr[i] = (TestStruct *)align_malloc(ptrSizes[i] * num_elements, min_alignment);
 
@@ -826,11 +844,14 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
                     inptr[i][j].b = get_random_float( -FLT_MAX, FLT_MAX, d );
                 }
 
-                ii = i << 1;
                 if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
-                    buffers[ii] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, inptr[i], &err);
+                    buffers[0] = clCreateBuffer(context, flag_set[src_flag_id],
+                                                ptrSizes[i] * num_elements,
+                                                inptr[i], &err);
                 else
-                    buffers[ii] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, NULL, &err);
+                    buffers[0] =
+                        clCreateBuffer(context, flag_set[src_flag_id],
+                                       ptrSizes[i] * num_elements, NULL, &err);
                 if ( err ){
                     align_free( outptr[i] );
                     print_error(err, " clCreateBuffer failed\n" );
@@ -839,10 +860,15 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
                 }
                 outptr[i] = align_malloc( ptrSizes[i] * num_elements, min_alignment);
                 if ((flag_set[dst_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[dst_flag_id] & CL_MEM_COPY_HOST_PTR))
-                    buffers[ii+1] = clCreateBuffer(context, flag_set[dst_flag_id],  ptrSizes[i] * num_elements, outptr[i], &err);
+                    buffers[1] = clCreateBuffer(context, flag_set[dst_flag_id],
+                                                ptrSizes[i] * num_elements,
+                                                outptr[i], &err);
                 else
-                    buffers[ii+1] = clCreateBuffer(context, flag_set[dst_flag_id],  ptrSizes[i] * num_elements, NULL, &err);
-                if ( ! buffers[ii+1] || err){
+                    buffers[1] =
+                        clCreateBuffer(context, flag_set[dst_flag_id],
+                                       ptrSizes[i] * num_elements, NULL, &err);
+                if (!buffers[1] || err)
+                {
                     align_free( outptr[i] );
                     print_error(err, " clCreateBuffer failed\n" );
                     free_mtdata(d);
@@ -851,7 +877,9 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
 
                 if (gTestMap) {
                     void *dataPtr;
-                    dataPtr = clEnqueueMapBuffer(queue, buffers[ii], CL_TRUE, CL_MAP_WRITE, 0, ptrSizes[i]*num_elements, 0, NULL, NULL, &err);
+                    dataPtr = clEnqueueMapBuffer(
+                        queue, buffers[0], CL_TRUE, CL_MAP_WRITE, 0,
+                        ptrSizes[i] * num_elements, 0, NULL, NULL, &err);
                     if (err) {
                         print_error(err, "clEnqueueMapBuffer failed");
                         align_free( outptr[i] );
@@ -861,7 +889,8 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
 
                     memcpy(dataPtr, inptr[i], ptrSizes[i]*num_elements);
 
-                    err = clEnqueueUnmapMemObject(queue, buffers[ii], dataPtr, 0, NULL, NULL);
+                    err = clEnqueueUnmapMemObject(queue, buffers[0], dataPtr, 0,
+                                                  NULL, NULL);
                     if (err) {
                         print_error(err, "clEnqueueUnmapMemObject failed");
                         align_free( outptr[i] );
@@ -870,7 +899,9 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
                     }
                 }
                 else if (!(flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) && !(flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR)) {
-                    err = clEnqueueWriteBuffer(queue, buffers[ii], CL_TRUE, 0, ptrSizes[i]*num_elements, inptr[i], 0, NULL, NULL);
+                    err = clEnqueueWriteBuffer(queue, buffers[0], CL_TRUE, 0,
+                                               ptrSizes[i] * num_elements,
+                                               inptr[i], 0, NULL, NULL);
                     if ( err != CL_SUCCESS ){
                         align_free( outptr[i] );
                         print_error( err, " clWriteBuffer failed" );
@@ -879,8 +910,10 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
                     }
                 }
 
-                err = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), (void *)&buffers[ii] );
-                err |= clSetKernelArg( kernel[i], 1, sizeof( cl_mem ), (void *)&buffers[ii+1] );
+                err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem),
+                                     (void *)&buffers[0]);
+                err |= clSetKernelArg(kernel[i], 1, sizeof(cl_mem),
+                                      (void *)&buffers[1]);
                 if ( err != CL_SUCCESS ){
                     align_free( outptr[i] );
                     print_error( err, " clSetKernelArg failed" );
@@ -896,7 +929,9 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
                     return -1;
                 }
 
-                err = clEnqueueReadBuffer( queue, buffers[ii+1], true, 0, ptrSizes[i]*num_elements, outptr[i], 0, NULL, NULL );
+                err = clEnqueueReadBuffer(queue, buffers[1], true, 0,
+                                          ptrSizes[i] * num_elements, outptr[i],
+                                          0, NULL, NULL);
                 if ( err != CL_SUCCESS ){
                     align_free( outptr[i] );
                     print_error( err, " clEnqueueReadBuffer failed" );
diff --git a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
index a4a825126b..483adac9a2 100644
--- a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
+++ b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
@@ -335,10 +335,10 @@ int test_compiler_defines_for_extensions(cl_device_id device, cl_context context
     strcat(kernel_code, kernel_strings[4]);
 
     // Now we need to execute the kernel
-    cl_mem defines;
+    clMemWrapper defines;
     cl_int *data;
-    cl_program program;
-    cl_kernel kernel;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
 
     Version version = get_device_cl_version(device);
 
@@ -432,10 +432,6 @@ int test_compiler_defines_for_extensions(cl_device_id device, cl_context context
       free(extensions_supported[i]);
     }
     free(extensions);
-    if( defines ) {
-        error = clReleaseMemObject( defines );
-        test_error( error, "Unable to release memory object" );
-    }
 
     if (total_errors)
         return -1;

From 7181bcdbac1dddb06cdcd713fe34cd3983acec7f Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Wed, 17 Feb 2021 17:05:09 +0000
Subject: [PATCH 045/158] Rename files for consistency (#1166)

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/CMakeLists.txt          | 8 ++++----
 test_conformance/math_brute_force/binary.cpp              | 4 ++--
 test_conformance/math_brute_force/binary_i.cpp            | 4 ++--
 .../{binaryOperator.cpp => binary_operator.cpp}           | 4 ++--
 .../math_brute_force/binary_two_results_i.cpp             | 4 ++--
 .../{FunctionList.cpp => function_list.cpp}               | 2 +-
 .../math_brute_force/{FunctionList.h => function_list.h}  | 4 ++--
 test_conformance/math_brute_force/i_unary.cpp             | 4 ++--
 test_conformance/math_brute_force/macro_binary.cpp        | 4 ++--
 test_conformance/math_brute_force/macro_unary.cpp         | 4 ++--
 test_conformance/math_brute_force/mad.cpp                 | 4 ++--
 test_conformance/math_brute_force/main.cpp                | 6 +++---
 test_conformance/math_brute_force/reference_math.cpp      | 2 +-
 .../math_brute_force/{Sleep.cpp => sleep.cpp}             | 4 ++--
 test_conformance/math_brute_force/{Sleep.h => sleep.h}    | 0
 test_conformance/math_brute_force/ternary.cpp             | 4 ++--
 test_conformance/math_brute_force/unary.cpp               | 4 ++--
 test_conformance/math_brute_force/unary_two_results.cpp   | 4 ++--
 test_conformance/math_brute_force/unary_two_results_i.cpp | 4 ++--
 test_conformance/math_brute_force/unary_u.cpp             | 4 ++--
 .../math_brute_force/{Utility.cpp => utility.cpp}         | 6 +++---
 .../math_brute_force/{Utility.h => utility.h}             | 0
 test_conformance/spir/CMakeLists.txt                      | 2 +-
 test_conformance/spir/run_build_test.cpp                  | 6 +++---
 test_conformance/spirv_new/CMakeLists.txt                 | 4 ++--
 25 files changed, 48 insertions(+), 48 deletions(-)
 rename test_conformance/math_brute_force/{binaryOperator.cpp => binary_operator.cpp} (99%)
 rename test_conformance/math_brute_force/{FunctionList.cpp => function_list.cpp} (99%)
 rename test_conformance/math_brute_force/{FunctionList.h => function_list.h} (98%)
 rename test_conformance/math_brute_force/{Sleep.cpp => sleep.cpp} (98%)
 rename test_conformance/math_brute_force/{Sleep.h => sleep.h} (100%)
 rename test_conformance/math_brute_force/{Utility.cpp => utility.cpp} (98%)
 rename test_conformance/math_brute_force/{Utility.h => utility.h} (100%)

diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt
index 8818039420..957233caa3 100644
--- a/test_conformance/math_brute_force/CMakeLists.txt
+++ b/test_conformance/math_brute_force/CMakeLists.txt
@@ -1,24 +1,24 @@
 set(MODULE_NAME BRUTEFORCE)
 
 set(${MODULE_NAME}_SOURCES
-    FunctionList.cpp
-    Sleep.cpp
     binary.cpp
-    binaryOperator.cpp
-    Utility.cpp
     binary_i.cpp
+    binary_operator.cpp
     binary_two_results_i.cpp
+    function_list.cpp
     i_unary.cpp
     macro_binary.cpp
     macro_unary.cpp
     mad.cpp
     main.cpp
     reference_math.cpp
+    sleep.cpp
     ternary.cpp
     unary.cpp
     unary_two_results.cpp
     unary_two_results_i.cpp
     unary_u.cpp
+    utility.cpp
 )
 
 if (NOT CMAKE_CL_64 AND NOT MSVC AND NOT ANDROID)
diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary.cpp
index c6c49359f5..343482cc36 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary.cpp
@@ -13,10 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
+#include "function_list.h"
+#include "utility.h"
 
 #include <string.h>
-#include "FunctionList.h"
 
 int TestFunc_Float_Float_Float(const Func *f, MTdata, bool relaxedMode);
 int TestFunc_Double_Double_Double(const Func *f, MTdata, bool relaxedMode);
diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i.cpp
index db2bf8e19e..5929f1d9c7 100644
--- a/test_conformance/math_brute_force/binary_i.cpp
+++ b/test_conformance/math_brute_force/binary_i.cpp
@@ -13,11 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
+#include "function_list.h"
+#include "utility.h"
 
 #include <limits.h>
 #include <string.h>
-#include "FunctionList.h"
 
 int TestFunc_Float_Float_Int(const Func *f, MTdata, bool relaxedMode);
 int TestFunc_Double_Double_Int(const Func *f, MTdata, bool relaxedMode);
diff --git a/test_conformance/math_brute_force/binaryOperator.cpp b/test_conformance/math_brute_force/binary_operator.cpp
similarity index 99%
rename from test_conformance/math_brute_force/binaryOperator.cpp
rename to test_conformance/math_brute_force/binary_operator.cpp
index c99cf87027..56cb8eb80b 100644
--- a/test_conformance/math_brute_force/binaryOperator.cpp
+++ b/test_conformance/math_brute_force/binary_operator.cpp
@@ -13,10 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
+#include "function_list.h"
+#include "utility.h"
 
 #include <string.h>
-#include "FunctionList.h"
 
 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata,
                                         bool relaxedMode);
diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
index ee87bc84d6..08ac3b4aa4 100644
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i.cpp
@@ -13,11 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
+#include "function_list.h"
+#include "utility.h"
 
 #include <limits.h>
 #include <string.h>
-#include "FunctionList.h"
 
 int TestFunc_FloatI_Float_Float(const Func *f, MTdata, bool relaxedMode);
 int TestFunc_DoubleI_Double_Double(const Func *f, MTdata, bool relaxedMode);
diff --git a/test_conformance/math_brute_force/FunctionList.cpp b/test_conformance/math_brute_force/function_list.cpp
similarity index 99%
rename from test_conformance/math_brute_force/FunctionList.cpp
rename to test_conformance/math_brute_force/function_list.cpp
index c5185c6fa6..2076aa689b 100644
--- a/test_conformance/math_brute_force/FunctionList.cpp
+++ b/test_conformance/math_brute_force/function_list.cpp
@@ -13,7 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "FunctionList.h"
+#include "function_list.h"
 #include "reference_math.h"
 
 #define FTZ_ON 1
diff --git a/test_conformance/math_brute_force/FunctionList.h b/test_conformance/math_brute_force/function_list.h
similarity index 98%
rename from test_conformance/math_brute_force/FunctionList.h
rename to test_conformance/math_brute_force/function_list.h
index e47eb72923..38f739ce0e 100644
--- a/test_conformance/math_brute_force/FunctionList.h
+++ b/test_conformance/math_brute_force/function_list.h
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#ifndef FUNCTIONLIST_H
-#define FUNCTIONLIST_H
+#ifndef FUNCTION_LIST_H
+#define FUNCTION_LIST_H
 
 #include "harness/compat.h"
 
diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary.cpp
index cd5e2fa845..038e769f1c 100644
--- a/test_conformance/math_brute_force/i_unary.cpp
+++ b/test_conformance/math_brute_force/i_unary.cpp
@@ -13,10 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
+#include "function_list.h"
+#include "utility.h"
 
 #include <string.h>
-#include "FunctionList.h"
 
 int TestFunc_Int_Float(const Func *f, MTdata, bool relaxedMode);
 int TestFunc_Int_Double(const Func *f, MTdata, bool relaxedMode);
diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary.cpp
index 46ba413e4f..09e0c6eb8d 100644
--- a/test_conformance/math_brute_force/macro_binary.cpp
+++ b/test_conformance/math_brute_force/macro_binary.cpp
@@ -13,10 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
+#include "function_list.h"
+#include "utility.h"
 
 #include <string.h>
-#include "FunctionList.h"
 
 int TestMacro_Int_Float_Float(const Func *f, MTdata, bool relaxedMode);
 int TestMacro_Int_Double_Double(const Func *f, MTdata, bool relaxedMode);
diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary.cpp
index 1c8275c06c..34827c032c 100644
--- a/test_conformance/math_brute_force/macro_unary.cpp
+++ b/test_conformance/math_brute_force/macro_unary.cpp
@@ -13,10 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
+#include "function_list.h"
+#include "utility.h"
 
 #include <string.h>
-#include "FunctionList.h"
 
 int TestMacro_Int_Float(const Func *f, MTdata, bool relaxedMode);
 int TestMacro_Int_Double(const Func *f, MTdata, bool relaxedMode);
diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad.cpp
index f1b13bd3b4..b0247b94fa 100644
--- a/test_conformance/math_brute_force/mad.cpp
+++ b/test_conformance/math_brute_force/mad.cpp
@@ -13,10 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
+#include "function_list.h"
+#include "utility.h"
 
 #include <string.h>
-#include "FunctionList.h"
 
 int TestFunc_mad(const Func *f, MTdata, bool relaxedMode);
 int TestFunc_mad_Double(const Func *f, MTdata, bool relaxedMode);
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 82248da20f..5074eb6289 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -13,14 +13,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
+#include "function_list.h"
+#include "sleep.h"
+#include "utility.h"
 
 #include <cstdio>
 #include <cstdlib>
 #include <string>
 #include <time.h>
-#include "FunctionList.h"
-#include "Sleep.h"
 
 #include "harness/errorHelpers.h"
 #include "harness/kernelHelpers.h"
diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp
index cfa5417659..cd1967dbe0 100644
--- a/test_conformance/math_brute_force/reference_math.cpp
+++ b/test_conformance/math_brute_force/reference_math.cpp
@@ -21,7 +21,7 @@
 #include <string.h>
 #endif
 
-#include "Utility.h"
+#include "utility.h"
 
 #if defined(__SSE__)                                                           \
     || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
diff --git a/test_conformance/math_brute_force/Sleep.cpp b/test_conformance/math_brute_force/sleep.cpp
similarity index 98%
rename from test_conformance/math_brute_force/Sleep.cpp
rename to test_conformance/math_brute_force/sleep.cpp
index 7103779e41..2669381448 100644
--- a/test_conformance/math_brute_force/Sleep.cpp
+++ b/test_conformance/math_brute_force/sleep.cpp
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Sleep.h"
-#include "Utility.h"
+#include "sleep.h"
+#include "utility.h"
 
 #if defined(__APPLE__)
 #include <IOKit/pwr_mgt/IOPMLib.h>
diff --git a/test_conformance/math_brute_force/Sleep.h b/test_conformance/math_brute_force/sleep.h
similarity index 100%
rename from test_conformance/math_brute_force/Sleep.h
rename to test_conformance/math_brute_force/sleep.h
diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary.cpp
index f05c605aca..df6f98407a 100644
--- a/test_conformance/math_brute_force/ternary.cpp
+++ b/test_conformance/math_brute_force/ternary.cpp
@@ -13,10 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
+#include "function_list.h"
+#include "utility.h"
 
 #include <string.h>
-#include "FunctionList.h"
 
 #define CORRECTLY_ROUNDED 0
 #define FLUSHED 1
diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary.cpp
index 0a00772df9..4a1f37d971 100644
--- a/test_conformance/math_brute_force/unary.cpp
+++ b/test_conformance/math_brute_force/unary.cpp
@@ -13,10 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
+#include "function_list.h"
+#include "utility.h"
 
 #include <string.h>
-#include "FunctionList.h"
 
 #if defined(__APPLE__)
 #include <sys/time.h>
diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results.cpp
index 28fbf19428..94ebb139f7 100644
--- a/test_conformance/math_brute_force/unary_two_results.cpp
+++ b/test_conformance/math_brute_force/unary_two_results.cpp
@@ -13,10 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
+#include "function_list.h"
+#include "utility.h"
 
 #include <string.h>
-#include "FunctionList.h"
 
 int TestFunc_Float2_Float(const Func *f, MTdata, bool relaxedMode);
 int TestFunc_Double2_Double(const Func *f, MTdata, bool relaxedMode);
diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i.cpp
index 2eb2ef3a1c..a59563c0aa 100644
--- a/test_conformance/math_brute_force/unary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i.cpp
@@ -13,11 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
+#include "function_list.h"
+#include "utility.h"
 
 #include <limits.h>
 #include <string.h>
-#include "FunctionList.h"
 
 int TestFunc_FloatI_Float(const Func *f, MTdata, bool relaxedMode);
 int TestFunc_DoubleI_Double(const Func *f, MTdata, bool relaxedMode);
diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u.cpp
index 26ceb683ce..bac5fb4e4a 100644
--- a/test_conformance/math_brute_force/unary_u.cpp
+++ b/test_conformance/math_brute_force/unary_u.cpp
@@ -13,10 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
+#include "function_list.h"
+#include "utility.h"
 
 #include <string.h>
-#include "FunctionList.h"
 
 int TestFunc_Float_UInt(const Func *f, MTdata, bool relaxedMode);
 int TestFunc_Double_ULong(const Func *f, MTdata, bool relaxedMode);
diff --git a/test_conformance/math_brute_force/Utility.cpp b/test_conformance/math_brute_force/utility.cpp
similarity index 98%
rename from test_conformance/math_brute_force/Utility.cpp
rename to test_conformance/math_brute_force/utility.cpp
index 3d8d9baab2..e47cdb25df 100644
--- a/test_conformance/math_brute_force/Utility.cpp
+++ b/test_conformance/math_brute_force/utility.cpp
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "Utility.h"
-#include "FunctionList.h"
+#include "utility.h"
+#include "function_list.h"
 
 #if defined(__PPC__)
 // Global varaiable used to hold the FPU control register state. The FPSCR
@@ -188,4 +188,4 @@ float getAllowedUlpError(const Func *f, const bool relaxed)
     }
 
     return ulp;
-}
\ No newline at end of file
+}
diff --git a/test_conformance/math_brute_force/Utility.h b/test_conformance/math_brute_force/utility.h
similarity index 100%
rename from test_conformance/math_brute_force/Utility.h
rename to test_conformance/math_brute_force/utility.h
diff --git a/test_conformance/spir/CMakeLists.txt b/test_conformance/spir/CMakeLists.txt
index 70effa15a4..f65c03139f 100644
--- a/test_conformance/spir/CMakeLists.txt
+++ b/test_conformance/spir/CMakeLists.txt
@@ -9,7 +9,7 @@ set (SPIR_SOURCES
     run_build_test.cpp
     run_services.cpp
     kernelargs.cpp
-    ../math_brute_force/FunctionList.cpp
+    ../math_brute_force/function_list.cpp
 )
 
 add_executable(${SPIR_OUT}
diff --git a/test_conformance/spir/run_build_test.cpp b/test_conformance/spir/run_build_test.cpp
index cec2d27552..9264d3a48a 100644
--- a/test_conformance/spir/run_build_test.cpp
+++ b/test_conformance/spir/run_build_test.cpp
@@ -33,12 +33,12 @@
 #include "harness/clImageHelper.h"
 #include "harness/os_helpers.h"
 
+#include "../math_brute_force/function_list.h"
+#include "datagen.h"
 #include "exceptions.h"
 #include "kernelargs.h"
-#include "datagen.h"
-#include "run_services.h"
 #include "run_build_test.h"
-#include "../math_brute_force/FunctionList.h"
+#include "run_services.h"
 #include <CL/cl.h>
 //
 // Task
diff --git a/test_conformance/spirv_new/CMakeLists.txt b/test_conformance/spirv_new/CMakeLists.txt
index 614d5a79fa..7500571d07 100644
--- a/test_conformance/spirv_new/CMakeLists.txt
+++ b/test_conformance/spirv_new/CMakeLists.txt
@@ -18,8 +18,8 @@ file(GLOB SPIRV_NEW_SOURCES "*.cpp")
 
 set(TEST_HARNESS_SOURCES
   ../../test_conformance/math_brute_force/reference_math.cpp
-  ../../test_conformance/math_brute_force/Utility.cpp
-  )
+  ../../test_conformance/math_brute_force/utility.cpp
+)
 
 set(${MODULE_NAME}_SOURCES ${SPIRV_NEW_SOURCES} ${TEST_HARNESS_SOURCES})
 

From 70b2492f30b44cadd6581a8f9fceb5602a8a6e9d Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Thu, 18 Feb 2021 10:06:37 +0000
Subject: [PATCH 046/158] Regroup vtbl definitions to one translation unit
 (#1167)

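Define every vtbl in function_list.cpp, as a static constexpr object,
instead of defining one in each test source file.

Move the test function declarations to a new header file,
test_functions.h, with relevant existing documentation.

Rename TestFunc_mad to TestFunc_mad_Float.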

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/binary.cpp  |  17 +--
 .../math_brute_force/binary_i.cpp             |   9 +-
 .../math_brute_force/binary_operator.cpp      |  11 +-
 .../math_brute_force/binary_two_results_i.cpp |  10 +-
 .../math_brute_force/function_list.cpp        | 102 ++++++++++++---
 test_conformance/math_brute_force/i_unary.cpp |  10 +-
 .../math_brute_force/macro_binary.cpp         |   9 +-
 .../math_brute_force/macro_unary.cpp          |   9 +-
 test_conformance/math_brute_force/mad.cpp     |  10 +-
 test_conformance/math_brute_force/main.cpp    |   1 +
 .../math_brute_force/reference_math.cpp       |   1 +
 test_conformance/math_brute_force/sleep.cpp   |   1 +
 test_conformance/math_brute_force/ternary.cpp |   9 +-
 .../math_brute_force/test_functions.h         | 118 ++++++++++++++++++
 test_conformance/math_brute_force/unary.cpp   |   8 +-
 .../math_brute_force/unary_two_results.cpp    |   9 +-
 .../math_brute_force/unary_two_results_i.cpp  |   9 +-
 test_conformance/math_brute_force/unary_u.cpp |   8 +-
 test_conformance/math_brute_force/utility.cpp |   1 +
 19 files changed, 234 insertions(+), 118 deletions(-)
 create mode 100644 test_conformance/math_brute_force/test_functions.h

diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary.cpp
index 343482cc36..699c09442d 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary.cpp
@@ -13,26 +13,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
+#include "test_functions.h"
 #include "utility.h"
 
 #include <string.h>
 
-int TestFunc_Float_Float_Float(const Func *f, MTdata, bool relaxedMode);
-int TestFunc_Double_Double_Double(const Func *f, MTdata, bool relaxedMode);
-int TestFunc_Float_Float_Float_nextafter(const Func *f, MTdata,
-                                         bool relaxedMode);
-int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata,
-                                            bool relaxedMode);
-
-extern const vtbl _binary = { "binary", TestFunc_Float_Float_Float,
-                              TestFunc_Double_Double_Double };
-
-extern const vtbl _binary_nextafter = {
-    "binary_nextafter", TestFunc_Float_Float_Float_nextafter,
-    TestFunc_Double_Double_Double_nextafter
-};
-
 const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
 const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
 
diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i.cpp
index 5929f1d9c7..50d14f33ce 100644
--- a/test_conformance/math_brute_force/binary_i.cpp
+++ b/test_conformance/math_brute_force/binary_i.cpp
@@ -13,19 +13,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
+#include "test_functions.h"
 #include "utility.h"
 
 #include <limits.h>
 #include <string.h>
 
-int TestFunc_Float_Float_Int(const Func *f, MTdata, bool relaxedMode);
-int TestFunc_Double_Double_Int(const Func *f, MTdata, bool relaxedMode);
-
-extern const vtbl _binary_i = { "binary_i", TestFunc_Float_Float_Int,
-                                TestFunc_Double_Double_Int };
-
-
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
diff --git a/test_conformance/math_brute_force/binary_operator.cpp b/test_conformance/math_brute_force/binary_operator.cpp
index 56cb8eb80b..65756901f6 100644
--- a/test_conformance/math_brute_force/binary_operator.cpp
+++ b/test_conformance/math_brute_force/binary_operator.cpp
@@ -13,20 +13,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
+#include "test_functions.h"
 #include "utility.h"
 
 #include <string.h>
 
-int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata,
-                                        bool relaxedMode);
-int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata,
-                                           bool relaxedMode);
-
-extern const vtbl _binary_operator = { "binaryOperator",
-                                       TestFunc_Float_Float_Float_Operator,
-                                       TestFunc_Double_Double_Double_Operator };
-
 static int BuildKernel(const char *name, const char *operator_symbol,
                        int vectorSize, cl_uint kernel_count, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
index 08ac3b4aa4..ec00351a12 100644
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i.cpp
@@ -13,20 +13,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
+#include "test_functions.h"
 #include "utility.h"
 
 #include <limits.h>
 #include <string.h>
 
-int TestFunc_FloatI_Float_Float(const Func *f, MTdata, bool relaxedMode);
-int TestFunc_DoubleI_Double_Double(const Func *f, MTdata, bool relaxedMode);
-
-extern const vtbl _binary_two_results_i = { "binary_two_results_i",
-                                            TestFunc_FloatI_Float_Float,
-                                            TestFunc_DoubleI_Double_Double };
-
-
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp
index 2076aa689b..ef72a085b6 100644
--- a/test_conformance/math_brute_force/function_list.cpp
+++ b/test_conformance/math_brute_force/function_list.cpp
@@ -13,8 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
 #include "reference_math.h"
+#include "test_functions.h"
 
 #define FTZ_ON 1
 #define FTZ_OFF 0
@@ -102,23 +104,89 @@
             _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
     }
 
-extern const vtbl _unary; // float foo( float )
-extern const vtbl _unary_u; // float foo( uint ),  double foo( ulong )
-extern const vtbl _i_unary; // int foo( float )
-extern const vtbl _macro_unary; // int foo( float ),  returns {0,1} for scalar,
-                                // { 0, -1 } for vector
-extern const vtbl _binary; // float foo( float, float )
-extern const vtbl _binary_nextafter; // float foo( float, float ), special
-                                     // handling for nextafter
-extern const vtbl _binary_operator; // float .op. float
-extern const vtbl _macro_binary; // int foo( float, float ), returns {0,1} for
-                                 // scalar, { 0, -1 } for vector
-extern const vtbl _binary_i; // float foo( float, int )
-extern const vtbl _ternary; // float foo( float, float, float )
-extern const vtbl _unary_two_results; // float foo( float, float * )
-extern const vtbl _unary_two_results_i; // float foo( float, int * )
-extern const vtbl _binary_two_results_i; // float foo( float, float, int * )
-extern const vtbl _mad_tbl; // float mad( float, float, float )
+static constexpr vtbl _unary = {
+    "unary",
+    TestFunc_Float_Float,
+    TestFunc_Double_Double,
+};
+
+static constexpr vtbl _i_unary = {
+    "i_unary",
+    TestFunc_Int_Float,
+    TestFunc_Int_Double,
+};
+
+static constexpr vtbl _unary_u = {
+    "unary_u",
+    TestFunc_Float_UInt,
+    TestFunc_Double_ULong,
+};
+
+static constexpr vtbl _macro_unary = {
+    "macro_unary",
+    TestMacro_Int_Float,
+    TestMacro_Int_Double,
+};
+
+static constexpr vtbl _binary = {
+    "binary",
+    TestFunc_Float_Float_Float,
+    TestFunc_Double_Double_Double,
+};
+
+static constexpr vtbl _binary_nextafter = {
+    "binary_nextafter",
+    TestFunc_Float_Float_Float_nextafter,
+    TestFunc_Double_Double_Double_nextafter,
+};
+
+static constexpr vtbl _binary_operator = {
+    "binaryOperator",
+    TestFunc_Float_Float_Float_Operator,
+    TestFunc_Double_Double_Double_Operator,
+};
+
+static constexpr vtbl _binary_i = {
+    "binary_i",
+    TestFunc_Float_Float_Int,
+    TestFunc_Double_Double_Int,
+};
+
+static constexpr vtbl _macro_binary = {
+    "macro_binary",
+    TestMacro_Int_Float_Float,
+    TestMacro_Int_Double_Double,
+};
+
+static constexpr vtbl _ternary = {
+    "ternary",
+    TestFunc_Float_Float_Float_Float,
+    TestFunc_Double_Double_Double_Double,
+};
+
+static constexpr vtbl _unary_two_results = {
+    "unary_two_results",
+    TestFunc_Float2_Float,
+    TestFunc_Double2_Double,
+};
+
+static constexpr vtbl _unary_two_results_i = {
+    "unary_two_results_i",
+    TestFunc_FloatI_Float,
+    TestFunc_DoubleI_Double,
+};
+
+static constexpr vtbl _binary_two_results_i = {
+    "binary_two_results_i",
+    TestFunc_FloatI_Float_Float,
+    TestFunc_DoubleI_Double_Double,
+};
+
+static constexpr vtbl _mad_tbl = {
+    "ternary",
+    TestFunc_mad_Float,
+    TestFunc_mad_Double,
+};
 
 #define unaryF &_unary
 #define i_unaryF &_i_unary
diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary.cpp
index 038e769f1c..9418d44def 100644
--- a/test_conformance/math_brute_force/i_unary.cpp
+++ b/test_conformance/math_brute_force/i_unary.cpp
@@ -13,19 +13,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
+#include "test_functions.h"
 #include "utility.h"
 
 #include <string.h>
 
-int TestFunc_Int_Float(const Func *f, MTdata, bool relaxedMode);
-int TestFunc_Int_Double(const Func *f, MTdata, bool relaxedMode);
-
-extern const vtbl _i_unary = { "i_unary", TestFunc_Int_Float,
-                               TestFunc_Int_Double };
-
-
-
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary.cpp
index 09e0c6eb8d..fb88e6072b 100644
--- a/test_conformance/math_brute_force/macro_binary.cpp
+++ b/test_conformance/math_brute_force/macro_binary.cpp
@@ -13,18 +13,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
+#include "test_functions.h"
 #include "utility.h"
 
 #include <string.h>
 
-int TestMacro_Int_Float_Float(const Func *f, MTdata, bool relaxedMode);
-int TestMacro_Int_Double_Double(const Func *f, MTdata, bool relaxedMode);
-
-extern const vtbl _macro_binary = { "macro_binary", TestMacro_Int_Float_Float,
-                                    TestMacro_Int_Double_Double };
-
-
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary.cpp
index 34827c032c..e5aa9e70bf 100644
--- a/test_conformance/math_brute_force/macro_unary.cpp
+++ b/test_conformance/math_brute_force/macro_unary.cpp
@@ -13,18 +13,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
+#include "test_functions.h"
 #include "utility.h"
 
 #include <string.h>
 
-int TestMacro_Int_Float(const Func *f, MTdata, bool relaxedMode);
-int TestMacro_Int_Double(const Func *f, MTdata, bool relaxedMode);
-
-extern const vtbl _macro_unary = { "macro_unary", TestMacro_Int_Float,
-                                   TestMacro_Int_Double };
-
-
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad.cpp
index b0247b94fa..0d8c6d44df 100644
--- a/test_conformance/math_brute_force/mad.cpp
+++ b/test_conformance/math_brute_force/mad.cpp
@@ -13,17 +13,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
+#include "test_functions.h"
 #include "utility.h"
 
 #include <string.h>
 
-int TestFunc_mad(const Func *f, MTdata, bool relaxedMode);
-int TestFunc_mad_Double(const Func *f, MTdata, bool relaxedMode);
-
-extern const vtbl _mad_tbl = { "ternary", TestFunc_mad, TestFunc_mad_Double };
-
-
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
@@ -235,7 +231,7 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->programs + i, info->relaxedMode);
 }
 
-int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode)
+int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     uint64_t i;
     uint32_t j, k;
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 5074eb6289..4be2020fc0 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -13,6 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
 #include "sleep.h"
 #include "utility.h"
diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp
index cd1967dbe0..d7ad4c7f99 100644
--- a/test_conformance/math_brute_force/reference_math.cpp
+++ b/test_conformance/math_brute_force/reference_math.cpp
@@ -13,6 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "harness/compat.h"
 #include "reference_math.h"
 #include <limits.h>
diff --git a/test_conformance/math_brute_force/sleep.cpp b/test_conformance/math_brute_force/sleep.cpp
index 2669381448..c7b1243d4f 100644
--- a/test_conformance/math_brute_force/sleep.cpp
+++ b/test_conformance/math_brute_force/sleep.cpp
@@ -13,6 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "sleep.h"
 #include "utility.h"
 
diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary.cpp
index df6f98407a..f8908909b5 100644
--- a/test_conformance/math_brute_force/ternary.cpp
+++ b/test_conformance/math_brute_force/ternary.cpp
@@ -13,7 +13,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
+#include "test_functions.h"
 #include "utility.h"
 
 #include <string.h>
@@ -21,13 +23,6 @@
 #define CORRECTLY_ROUNDED 0
 #define FLUSHED 1
 
-int TestFunc_Float_Float_Float_Float(const Func *f, MTdata, bool relaxedMode);
-int TestFunc_Double_Double_Double_Double(const Func *f, MTdata,
-                                         bool relaxedMode);
-
-extern const vtbl _ternary = { "ternary", TestFunc_Float_Float_Float_Float,
-                               TestFunc_Double_Double_Double_Double };
-
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
diff --git a/test_conformance/math_brute_force/test_functions.h b/test_conformance/math_brute_force/test_functions.h
new file mode 100644
index 0000000000..78aef9c9a6
--- /dev/null
+++ b/test_conformance/math_brute_force/test_functions.h
@@ -0,0 +1,118 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TEST_FUNCTIONS_H
+#define TEST_FUNCTIONS_H
+
+#include "function_list.h"
+
+// float foo(float)
+int TestFunc_Float_Float(const Func *f, MTdata, bool relaxedMode);
+
+// double foo(double)
+int TestFunc_Double_Double(const Func *f, MTdata, bool relaxedMode);
+
+// int foo(float)
+int TestFunc_Int_Float(const Func *f, MTdata, bool relaxedMode);
+
+// int foo(double)
+int TestFunc_Int_Double(const Func *f, MTdata, bool relaxedMode);
+
+// float foo(uint)
+int TestFunc_Float_UInt(const Func *f, MTdata, bool relaxedMode);
+
+// double foo(ulong)
+int TestFunc_Double_ULong(const Func *f, MTdata, bool relaxedMode);
+
+// Returns {0, 1} for scalar and {0, -1} for vector.
+// int foo(float)
+int TestMacro_Int_Float(const Func *f, MTdata, bool relaxedMode);
+
+// Returns {0, 1} for scalar and {0, -1} for vector.
+// int foo(double)
+int TestMacro_Int_Double(const Func *f, MTdata, bool relaxedMode);
+
+// float foo(float, float)
+int TestFunc_Float_Float_Float(const Func *f, MTdata, bool relaxedMode);
+
+// double foo(double, double)
+int TestFunc_Double_Double_Double(const Func *f, MTdata, bool relaxedMode);
+
+// Special handling for nextafter.
+// float foo(float, float)
+int TestFunc_Float_Float_Float_nextafter(const Func *f, MTdata,
+                                         bool relaxedMode);
+
+// Special handling for nextafter.
+// double foo(double, double)
+int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata,
+                                            bool relaxedMode);
+
+// float op float
+int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata,
+                                        bool relaxedMode);
+
+// double op double
+int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata,
+                                           bool relaxedMode);
+
+// float foo(float, int)
+int TestFunc_Float_Float_Int(const Func *f, MTdata, bool relaxedMode);
+
+// double foo(double, int)
+int TestFunc_Double_Double_Int(const Func *f, MTdata, bool relaxedMode);
+
+// Returns {0, 1} for scalar and {0, -1} for vector.
+// int foo(float, float)
+int TestMacro_Int_Float_Float(const Func *f, MTdata, bool relaxedMode);
+
+// Returns {0, 1} for scalar and {0, -1} for vector.
+// int foo(double, double)
+int TestMacro_Int_Double_Double(const Func *f, MTdata, bool relaxedMode);
+
+// float foo(float, float, float)
+int TestFunc_Float_Float_Float_Float(const Func *f, MTdata, bool relaxedMode);
+
+// double foo(double, double, double)
+int TestFunc_Double_Double_Double_Double(const Func *f, MTdata,
+                                         bool relaxedMode);
+
+// float foo(float, float*)
+int TestFunc_Float2_Float(const Func *f, MTdata, bool relaxedMode);
+
+// double foo(double, double*)
+int TestFunc_Double2_Double(const Func *f, MTdata, bool relaxedMode);
+
+// float foo(float, int*)
+int TestFunc_FloatI_Float(const Func *f, MTdata, bool relaxedMode);
+
+// double foo(double, int*)
+int TestFunc_DoubleI_Double(const Func *f, MTdata, bool relaxedMode);
+
+// float foo(float, float, int*)
+int TestFunc_FloatI_Float_Float(const Func *f, MTdata, bool relaxedMode);
+
+// double foo(double, double, int*)
+int TestFunc_DoubleI_Double_Double(const Func *f, MTdata, bool relaxedMode);
+
+// Special handling for mad.
+// float mad(float, float, float)
+int TestFunc_mad_Float(const Func *f, MTdata, bool relaxedMode);
+
+// Special handling for mad.
+// double mad(double, double, double)
+int TestFunc_mad_Double(const Func *f, MTdata, bool relaxedMode);
+
+#endif
diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary.cpp
index 4a1f37d971..dc6d56c1c6 100644
--- a/test_conformance/math_brute_force/unary.cpp
+++ b/test_conformance/math_brute_force/unary.cpp
@@ -13,7 +13,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
+#include "test_functions.h"
 #include "utility.h"
 
 #include <string.h>
@@ -22,12 +24,6 @@
 #include <sys/time.h>
 #endif
 
-int TestFunc_Float_Float(const Func *f, MTdata, bool relaxedMode);
-int TestFunc_Double_Double(const Func *f, MTdata, bool relaxedMode);
-
-extern const vtbl _unary = { "unary", TestFunc_Float_Float,
-                             TestFunc_Double_Double };
-
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results.cpp
index 94ebb139f7..accebd3a4e 100644
--- a/test_conformance/math_brute_force/unary_two_results.cpp
+++ b/test_conformance/math_brute_force/unary_two_results.cpp
@@ -13,18 +13,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
+#include "test_functions.h"
 #include "utility.h"
 
 #include <string.h>
 
-int TestFunc_Float2_Float(const Func *f, MTdata, bool relaxedMode);
-int TestFunc_Double2_Double(const Func *f, MTdata, bool relaxedMode);
-
-extern const vtbl _unary_two_results = { "unary_two_results",
-                                         TestFunc_Float2_Float,
-                                         TestFunc_Double2_Double };
-
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i.cpp
index a59563c0aa..2ac083d2f2 100644
--- a/test_conformance/math_brute_force/unary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i.cpp
@@ -13,19 +13,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
+#include "test_functions.h"
 #include "utility.h"
 
 #include <limits.h>
 #include <string.h>
 
-int TestFunc_FloatI_Float(const Func *f, MTdata, bool relaxedMode);
-int TestFunc_DoubleI_Double(const Func *f, MTdata, bool relaxedMode);
-
-extern const vtbl _unary_two_results_i = { "unary_two_results_i",
-                                           TestFunc_FloatI_Float,
-                                           TestFunc_DoubleI_Double };
-
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u.cpp
index bac5fb4e4a..3b8f1f6908 100644
--- a/test_conformance/math_brute_force/unary_u.cpp
+++ b/test_conformance/math_brute_force/unary_u.cpp
@@ -13,17 +13,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "function_list.h"
+#include "test_functions.h"
 #include "utility.h"
 
 #include <string.h>
 
-int TestFunc_Float_UInt(const Func *f, MTdata, bool relaxedMode);
-int TestFunc_Double_ULong(const Func *f, MTdata, bool relaxedMode);
-
-extern const vtbl _unary_u = { "unary_u", TestFunc_Float_UInt,
-                               TestFunc_Double_ULong };
-
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
diff --git a/test_conformance/math_brute_force/utility.cpp b/test_conformance/math_brute_force/utility.cpp
index e47cdb25df..9b0191ab36 100644
--- a/test_conformance/math_brute_force/utility.cpp
+++ b/test_conformance/math_brute_force/utility.cpp
@@ -13,6 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
 #include "utility.h"
 #include "function_list.h"
 

From fc67d7b28f8fd5fec8fb0a9196c0b1baa537f5f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=A9vin=20Petit?= <kevin.petit@arm.com>
Date: Thu, 18 Feb 2021 10:06:56 +0000
Subject: [PATCH 047/158] Move media_sharing tests to
 test_conformance/extensions/ (#1164)

* Move media_sharing tests to test_conformance/extensions/

And rename to cl_khr_dx9_media_sharing.

Signed-off-by: Kevin Petit <kevin.petit@arm.com>

* format code

* more format changes
---
 CMakeLists.txt                                |    1 -
 test_conformance/extensions/CMakeLists.txt    |    1 +
 .../cl_khr_dx9_media_sharing}/CMakeLists.txt  |    2 +-
 .../cl_khr_dx9_media_sharing/main.cpp         |  231 +++
 .../cl_khr_dx9_media_sharing}/procs.h         |   23 +-
 .../test_create_context.cpp                   |  373 ++++
 .../test_functions_api.cpp                    |  781 ++++++++
 .../test_functions_kernel.cpp                 |  541 ++++++
 .../test_get_device_ids.cpp                   |  220 +++
 .../test_interop_sync.cpp                     |  419 +++++
 .../test_memory_access.cpp                    |  549 ++++++
 .../test_other_data_types.cpp                 | 1319 +++++++++++++
 .../cl_khr_dx9_media_sharing/utils.cpp        | 1664 +++++++++++++++++
 .../cl_khr_dx9_media_sharing/utils.h          |  215 +++
 .../cl_khr_dx9_media_sharing/wrappers.cpp     |  463 +++++
 .../cl_khr_dx9_media_sharing/wrappers.h       |  195 ++
 test_extensions/CMakeLists.txt                |    2 -
 test_extensions/media_sharing/main.cpp        |  204 --
 .../media_sharing/test_create_context.cpp     |  318 ----
 .../media_sharing/test_functions_api.cpp      |  617 ------
 .../media_sharing/test_functions_kernel.cpp   |  446 -----
 .../media_sharing/test_get_device_ids.cpp     |  196 --
 .../media_sharing/test_interop_sync.cpp       |  357 ----
 .../media_sharing/test_memory_access.cpp      |  468 -----
 .../media_sharing/test_other_data_types.cpp   | 1023 ----------
 test_extensions/media_sharing/utils.cpp       | 1595 ----------------
 test_extensions/media_sharing/utils.h         |  167 --
 test_extensions/media_sharing/wrappers.cpp    |  562 ------
 test_extensions/media_sharing/wrappers.h      |  197 --
 29 files changed, 6987 insertions(+), 6162 deletions(-)
 rename {test_extensions/media_sharing => test_conformance/extensions/cl_khr_dx9_media_sharing}/CMakeLists.txt (92%)
 create mode 100644 test_conformance/extensions/cl_khr_dx9_media_sharing/main.cpp
 rename {test_extensions/media_sharing => test_conformance/extensions/cl_khr_dx9_media_sharing}/procs.h (61%)
 create mode 100644 test_conformance/extensions/cl_khr_dx9_media_sharing/test_create_context.cpp
 create mode 100644 test_conformance/extensions/cl_khr_dx9_media_sharing/test_functions_api.cpp
 create mode 100644 test_conformance/extensions/cl_khr_dx9_media_sharing/test_functions_kernel.cpp
 create mode 100644 test_conformance/extensions/cl_khr_dx9_media_sharing/test_get_device_ids.cpp
 create mode 100644 test_conformance/extensions/cl_khr_dx9_media_sharing/test_interop_sync.cpp
 create mode 100644 test_conformance/extensions/cl_khr_dx9_media_sharing/test_memory_access.cpp
 create mode 100644 test_conformance/extensions/cl_khr_dx9_media_sharing/test_other_data_types.cpp
 create mode 100644 test_conformance/extensions/cl_khr_dx9_media_sharing/utils.cpp
 create mode 100644 test_conformance/extensions/cl_khr_dx9_media_sharing/utils.h
 create mode 100644 test_conformance/extensions/cl_khr_dx9_media_sharing/wrappers.cpp
 create mode 100644 test_conformance/extensions/cl_khr_dx9_media_sharing/wrappers.h
 delete mode 100644 test_extensions/CMakeLists.txt
 delete mode 100644 test_extensions/media_sharing/main.cpp
 delete mode 100644 test_extensions/media_sharing/test_create_context.cpp
 delete mode 100644 test_extensions/media_sharing/test_functions_api.cpp
 delete mode 100644 test_extensions/media_sharing/test_functions_kernel.cpp
 delete mode 100644 test_extensions/media_sharing/test_get_device_ids.cpp
 delete mode 100644 test_extensions/media_sharing/test_interop_sync.cpp
 delete mode 100644 test_extensions/media_sharing/test_memory_access.cpp
 delete mode 100644 test_extensions/media_sharing/test_other_data_types.cpp
 delete mode 100644 test_extensions/media_sharing/utils.cpp
 delete mode 100644 test_extensions/media_sharing/utils.h
 delete mode 100644 test_extensions/media_sharing/wrappers.cpp
 delete mode 100644 test_extensions/media_sharing/wrappers.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 844283aaaa..5d32692c9d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -223,7 +223,6 @@ endif(CMAKE_BUILD_TYPE STREQUAL "release")
 
 add_subdirectory(test_common)
 add_subdirectory(test_conformance)
-add_subdirectory(test_extensions)
 
 # Support both VS2008 and VS2012.
 set (DLL_FILES "${VS_BUILD_DIR}/Debug/*.dll")
diff --git a/test_conformance/extensions/CMakeLists.txt b/test_conformance/extensions/CMakeLists.txt
index c917eb3060..53d77ee55d 100644
--- a/test_conformance/extensions/CMakeLists.txt
+++ b/test_conformance/extensions/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory( cl_ext_cxx_for_opencl )
+add_subdirectory( cl_khr_dx9_media_sharing )
diff --git a/test_extensions/media_sharing/CMakeLists.txt b/test_conformance/extensions/cl_khr_dx9_media_sharing/CMakeLists.txt
similarity index 92%
rename from test_extensions/media_sharing/CMakeLists.txt
rename to test_conformance/extensions/cl_khr_dx9_media_sharing/CMakeLists.txt
index 9fdde1c7e6..1ec2a33895 100644
--- a/test_extensions/media_sharing/CMakeLists.txt
+++ b/test_conformance/extensions/cl_khr_dx9_media_sharing/CMakeLists.txt
@@ -21,5 +21,5 @@ set_source_files_properties(
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
-include(../../test_conformance/CMakeCommon.txt)
+include(../../CMakeCommon.txt)
 endif(WIN32)
diff --git a/test_conformance/extensions/cl_khr_dx9_media_sharing/main.cpp b/test_conformance/extensions/cl_khr_dx9_media_sharing/main.cpp
new file mode 100644
index 0000000000..8b70917316
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_dx9_media_sharing/main.cpp
@@ -0,0 +1,231 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "harness/testHarness.h"
+#include "utils.h"
+#include "procs.h"
+
+// Test table handed to runTestHarness() in main().
+test_definition test_list[] = { ADD_TEST(context_create),
+                                ADD_TEST(get_device_ids),
+                                ADD_TEST(api),
+                                ADD_TEST(kernel),
+                                ADD_TEST(other_data_types),
+                                ADD_TEST(memory_access),
+                                ADD_TEST(interop_user_sync) };
+
+const int test_num = ARRAY_SIZE(test_list);
+// Extension entry points, filled in by MediaSurfaceSharingExtensionInit().
+clGetDeviceIDsFromDX9MediaAdapterKHR_fn clGetDeviceIDsFromDX9MediaAdapterKHR =
+    NULL;
+clCreateFromDX9MediaSurfaceKHR_fn clCreateFromDX9MediaSurfaceKHR = NULL;
+clEnqueueAcquireDX9MediaSurfacesKHR_fn clEnqueueAcquireDX9MediaSurfacesKHR =
+    NULL;
+clEnqueueReleaseDX9MediaSurfacesKHR_fn clEnqueueReleaseDX9MediaSurfacesKHR =
+    NULL;
+// Selection state: set by DetectPlatformAndDevice() and CmdlineParse().
+cl_platform_id gPlatformIDdetected;
+cl_device_id gDeviceIDdetected;
+cl_device_type gDeviceTypeSelected = CL_DEVICE_TYPE_DEFAULT;
+// Loads the cl_khr_dx9_media_sharing entry points; false if any is missing.
+bool MediaSurfaceSharingExtensionInit()
+{
+    clGetDeviceIDsFromDX9MediaAdapterKHR =
+        (clGetDeviceIDsFromDX9MediaAdapterKHR_fn)
+            clGetExtensionFunctionAddressForPlatform(
+                gPlatformIDdetected, "clGetDeviceIDsFromDX9MediaAdapterKHR");
+    if (clGetDeviceIDsFromDX9MediaAdapterKHR == NULL)
+    {
+        log_error("clGetExtensionFunctionAddressForPlatform("
+                  "clGetDeviceIDsFromDX9MediaAdapterKHR) returned NULL.\n");
+        return false;
+    }
+    // Each lookup uses the platform chosen by DetectPlatformAndDevice().
+    clCreateFromDX9MediaSurfaceKHR = (clCreateFromDX9MediaSurfaceKHR_fn)
+        clGetExtensionFunctionAddressForPlatform(
+            gPlatformIDdetected, "clCreateFromDX9MediaSurfaceKHR");
+    if (clCreateFromDX9MediaSurfaceKHR == NULL)
+    {
+        log_error("clGetExtensionFunctionAddressForPlatform("
+                  "clCreateFromDX9MediaSurfaceKHR) returned NULL.\n");
+        return false;
+    }
+
+    clEnqueueAcquireDX9MediaSurfacesKHR =
+        (clEnqueueAcquireDX9MediaSurfacesKHR_fn)
+            clGetExtensionFunctionAddressForPlatform(
+                gPlatformIDdetected, "clEnqueueAcquireDX9MediaSurfacesKHR");
+    if (clEnqueueAcquireDX9MediaSurfacesKHR == NULL)
+    {
+        log_error("clGetExtensionFunctionAddressForPlatform("
+                  "clEnqueueAcquireDX9MediaSurfacesKHR) returned NULL.\n");
+        return false;
+    }
+
+    clEnqueueReleaseDX9MediaSurfacesKHR =
+        (clEnqueueReleaseDX9MediaSurfacesKHR_fn)
+            clGetExtensionFunctionAddressForPlatform(
+                gPlatformIDdetected, "clEnqueueReleaseDX9MediaSurfacesKHR");
+    if (clEnqueueReleaseDX9MediaSurfacesKHR == NULL)
+    {
+        log_error("clGetExtensionFunctionAddressForPlatform("
+                  "clEnqueueReleaseDX9MediaSurfacesKHR) returned NULL.\n");
+        return false;
+    }
+
+    return true;
+}
+// Picks the first platform/device pair exposing cl_khr_dx9_media_sharing.
+bool DetectPlatformAndDevice()
+{
+    std::vector<cl_platform_id> platforms;
+    cl_uint platformsNum = 0;
+    cl_int error = clGetPlatformIDs(0, 0, &platformsNum);
+    if (error != CL_SUCCESS)
+    {
+        print_error(error, "clGetPlatformIDs failed\n");
+        return false;
+    }
+    // data() stays valid for an empty vector, unlike &platforms[0].
+    platforms.resize(platformsNum);
+    error = clGetPlatformIDs(platformsNum, platforms.data(), 0);
+    if (error != CL_SUCCESS)
+    {
+        print_error(error, "clGetPlatformIDs failed\n");
+        return false;
+    }
+    bool found = false;
+    // Stop at the first match so a later platform cannot abort the search.
+    for (size_t i = 0; i < platformsNum && !found; ++i)
+    {
+        cl_uint devicesNum = 0;
+        error = clGetDeviceIDs(platforms[i], gDeviceTypeSelected, 0, 0,
+                               &devicesNum);
+        // A platform without devices of this type is skipped, not fatal.
+        if (error == CL_DEVICE_NOT_FOUND) continue;
+        if (error != CL_SUCCESS)
+        {
+            print_error(error, "clGetDeviceIDs failed\n");
+            return false;
+        }
+        std::vector<cl_device_id> devices(devicesNum);
+        error = clGetDeviceIDs(platforms[i], gDeviceTypeSelected, devicesNum,
+                               devices.data(), 0);
+        if (error != CL_SUCCESS)
+        {
+            print_error(error, "clGetDeviceIDs failed\n");
+            return false;
+        }
+
+        for (size_t j = 0; j < devicesNum; ++j)
+        {
+            if (is_extension_available(devices[j], "cl_khr_dx9_media_sharing"))
+            {
+                gPlatformIDdetected = platforms[i];
+                gDeviceIDdetected = devices[j];
+                found = true;
+                break;
+            }
+        }
+    }
+
+    if (!found)
+    {
+        log_info("Test was not run, because the media surface sharing "
+                 "extension is not supported for any devices.\n");
+        return false;
+    }
+
+    return true;
+}
+// Sets gDeviceTypeSelected from CL_DEVICE_TYPE, then from argv (argv wins).
+bool CmdlineParse(int argc, const char *argv[])
+{
+    char *env_mode = getenv("CL_DEVICE_TYPE");
+    if (env_mode != NULL)
+    {
+        if (strcmp(env_mode, "gpu") == 0
+            || strcmp(env_mode, "CL_DEVICE_TYPE_GPU") == 0)
+            gDeviceTypeSelected = CL_DEVICE_TYPE_GPU;
+        else if (strcmp(env_mode, "cpu") == 0
+                 || strcmp(env_mode, "CL_DEVICE_TYPE_CPU") == 0)
+            gDeviceTypeSelected = CL_DEVICE_TYPE_CPU;
+        else if (strcmp(env_mode, "accelerator") == 0
+                 || strcmp(env_mode, "CL_DEVICE_TYPE_ACCELERATOR") == 0)
+            gDeviceTypeSelected = CL_DEVICE_TYPE_ACCELERATOR;
+        else if (strcmp(env_mode, "default") == 0
+                 || strcmp(env_mode, "CL_DEVICE_TYPE_DEFAULT") == 0)
+            gDeviceTypeSelected = CL_DEVICE_TYPE_DEFAULT;
+        else
+        {
+            log_error("Unknown CL_DEVICE_TYPE env variable setting: "
+                      "%s.\nAborting...\n",
+                      env_mode);
+            return false;
+        }
+    }
+    // argv overrides the environment; unrecognised arguments are ignored.
+    for (int i = 0; i < argc; ++i)
+    {
+        if (strcmp(argv[i], "gpu") == 0
+            || strcmp(argv[i], "CL_DEVICE_TYPE_GPU") == 0)
+        {
+            gDeviceTypeSelected = CL_DEVICE_TYPE_GPU;
+            continue;
+        }
+        else if (strcmp(argv[i], "cpu") == 0
+                 || strcmp(argv[i], "CL_DEVICE_TYPE_CPU") == 0)
+        {
+            gDeviceTypeSelected = CL_DEVICE_TYPE_CPU;
+            continue;
+        }
+        else if (strcmp(argv[i], "accelerator") == 0
+                 || strcmp(argv[i], "CL_DEVICE_TYPE_ACCELERATOR") == 0)
+        {
+            gDeviceTypeSelected = CL_DEVICE_TYPE_ACCELERATOR;
+            continue;
+        }
+        else if (strcmp(argv[i], "CL_DEVICE_TYPE_DEFAULT") == 0)
+        {
+            gDeviceTypeSelected = CL_DEVICE_TYPE_DEFAULT;
+            continue;
+        }
+        else if (strcmp(argv[i], "sw") == 0 || strcmp(argv[i], "software") == 0)
+        {
+            CDeviceWrapper::AccelerationType(CDeviceWrapper::ACCELERATION_SW);
+        }
+    }
+
+    return true;
+}
+// Entry point: parse options, find a capable device, then run the harness.
+int main(int argc, const char *argv[])
+{
+    if (!CmdlineParse(argc, argv)) return TEST_FAIL;
+    // NOTE(review): detection errors are also reported as TEST_SKIP here.
+    if (!DetectPlatformAndDevice())
+    {
+        log_info("Test was not run, because the media surface sharing "
+                 "extension is not supported\n");
+        return TEST_SKIP;
+    }
+
+    if (!MediaSurfaceSharingExtensionInit()) return TEST_FAIL;
+
+    return runTestHarness(argc, argv, test_num, test_list, true, 0);
+}
diff --git a/test_extensions/media_sharing/procs.h b/test_conformance/extensions/cl_khr_dx9_media_sharing/procs.h
similarity index 61%
rename from test_extensions/media_sharing/procs.h
rename to test_conformance/extensions/cl_khr_dx9_media_sharing/procs.h
index 6b577990de..e7fd785e90 100644
--- a/test_extensions/media_sharing/procs.h
+++ b/test_conformance/extensions/cl_khr_dx9_media_sharing/procs.h
@@ -19,13 +19,20 @@
 #define __MEDIA_SHARING_PROCS_H__
 
 
-extern int test_context_create(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_get_device_ids(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_api(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_kernel(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_other_data_types(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_memory_access(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_interop_user_sync(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_context_create(cl_device_id deviceID, cl_context context,
+                               cl_command_queue queue, int num_elements);
+extern int test_get_device_ids(cl_device_id deviceID, cl_context context,
+                               cl_command_queue queue, int num_elements);
+extern int test_api(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements);
+extern int test_kernel(cl_device_id deviceID, cl_context context,
+                       cl_command_queue queue, int num_elements);
+extern int test_other_data_types(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements);
+extern int test_memory_access(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements);
+extern int test_interop_user_sync(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements);
 
 
-#endif    // #ifndef __MEDIA_SHARING_PROCS_H__ 
\ No newline at end of file
+#endif // #ifndef __MEDIA_SHARING_PROCS_H__
\ No newline at end of file
diff --git a/test_conformance/extensions/cl_khr_dx9_media_sharing/test_create_context.cpp b/test_conformance/extensions/cl_khr_dx9_media_sharing/test_create_context.cpp
new file mode 100644
index 0000000000..6033ce9bdb
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_dx9_media_sharing/test_create_context.cpp
@@ -0,0 +1,373 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "utils.h"
+// Creates a CL context on each media adapter and round-trips surface data.
+int context_create(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements, unsigned int width,
+                   unsigned int height, TContextFuncType functionCreate,
+                   cl_dx9_media_adapter_type_khr adapterType,
+                   TSurfaceFormat surfaceFormat, TSharedHandleType sharedHandle)
+{
+    CResult result;
+
+    // create device
+    std::auto_ptr<CDeviceWrapper> deviceWrapper;
+    if (!DeviceCreate(adapterType, deviceWrapper))
+    {
+        result.ResultSub(CResult::TEST_ERROR);
+        return result.Result();
+    }
+
+    // generate input data
+    std::vector<cl_uchar> bufferIn(width * height * 3 / 2, 0);
+    if (!YUVGenerate(surfaceFormat, bufferIn, width, height, 0, 255))
+    {
+        result.ResultSub(CResult::TEST_ERROR);
+        return result.Result();
+    }
+    // Repeat the whole scenario for every adapter AdapterNext() yields.
+    while (deviceWrapper->AdapterNext())
+    {
+        cl_int error;
+        // check if the test can be run on the adapter
+        if (CL_SUCCESS
+            != (error = deviceExistForCLTest(gPlatformIDdetected, adapterType,
+                                             deviceWrapper->Device(), result,
+                                             sharedHandle)))
+        {
+            return result.Result();
+        }
+
+        if (surfaceFormat != SURFACE_FORMAT_NV12
+            && !SurfaceFormatCheck(adapterType, *deviceWrapper, surfaceFormat))
+        {
+            std::string sharedHandleStr =
+                (sharedHandle == SHARED_HANDLE_ENABLED) ? "yes" : "no";
+            std::string formatStr;
+            std::string adapterStr;
+            SurfaceFormatToString(surfaceFormat, formatStr);
+            AdapterToString(adapterType, adapterStr);
+            log_info(
+                "Skipping test case, image format is not supported by a device "
+                "(adapter type: %s, format: %s, shared handle: %s)\n",
+                adapterStr.c_str(), formatStr.c_str(), sharedHandleStr.c_str());
+            return result.Result();
+        }
+
+        void *objectSharedHandle = 0;
+        std::auto_ptr<CSurfaceWrapper> surface;
+        if (!MediaSurfaceCreate(
+                adapterType, width, height, surfaceFormat, *deviceWrapper,
+                surface, (sharedHandle == SHARED_HANDLE_ENABLED) ? true : false,
+                &objectSharedHandle))
+        {
+            log_error("Media surface creation failed for %i adapter\n",
+                      deviceWrapper->AdapterIdx());
+            result.ResultSub(CResult::TEST_ERROR);
+            return result.Result();
+        }
+
+        cl_context_properties contextProperties[] = {
+            CL_CONTEXT_PLATFORM,
+            (cl_context_properties)gPlatformIDdetected,
+            AdapterTypeToContextInfo(adapterType),
+            (cl_context_properties)deviceWrapper->Device(),
+            0,
+        };
+
+        clContextWrapper ctx;
+        switch (functionCreate)
+        {
+            case CONTEXT_CREATE_DEFAULT:
+                ctx = clCreateContext(&contextProperties[0], 1,
+                                      &gDeviceIDdetected, NULL, NULL, &error);
+                break;
+            case CONTEXT_CREATE_FROM_TYPE:
+                ctx = clCreateContextFromType(&contextProperties[0],
+                                              gDeviceTypeSelected, NULL, NULL,
+                                              &error);
+                break;
+            default:
+                log_error("Unknown context creation function enum\n");
+                result.ResultSub(CResult::TEST_ERROR);
+                return result.Result();
+                break;
+        }
+
+        if (error != CL_SUCCESS)
+        {
+            std::string functionName;
+            FunctionContextCreateToString(functionCreate, functionName);
+            log_error("%s failed: %s\n", functionName.c_str(),
+                      IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        if (!YUVSurfaceSet(surfaceFormat, surface, bufferIn, width, height))
+        {
+            result.ResultSub(CResult::TEST_ERROR);
+            return result.Result();
+        }
+
+#if defined(_WIN32)
+        cl_dx9_surface_info_khr surfaceInfo;
+        surfaceInfo.resource =
+            *(static_cast<CD3D9SurfaceWrapper *>(surface.get()));
+        surfaceInfo.shared_handle = objectSharedHandle;
+#else
+        void *surfaceInfo = 0;
+        return TEST_NOT_IMPLEMENTED;
+#endif
+        // One cl_mem per surface plane; memObjList keeps the raw handles.
+        std::vector<cl_mem> memObjList;
+        unsigned int planesNum = PlanesNum(surfaceFormat);
+        std::vector<clMemWrapper> planesList(planesNum);
+        for (unsigned int planeIdx = 0; planeIdx < planesNum; ++planeIdx)
+        {
+            planesList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(
+                ctx, CL_MEM_READ_WRITE, adapterType, &surfaceInfo, planeIdx,
+                &error);
+            if (error != CL_SUCCESS)
+            {
+                log_error(
+                    "clCreateFromDX9MediaSurfaceKHR failed for plane %i: %s\n",
+                    planeIdx, IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+            memObjList.push_back(planesList[planeIdx]);
+        }
+
+        clCommandQueueWrapper cmdQueue = clCreateCommandQueueWithProperties(
+            ctx, gDeviceIDdetected, 0, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("Unable to create command queue: %s\n",
+                      IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        if (!ImageInfoVerify(adapterType, memObjList, width, height, surface,
+                             objectSharedHandle))
+        {
+            log_error("Image info verification failed\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+        // Acquire the surfaces for OpenCL and check the event's command type.
+        cl_event event;
+        error = clEnqueueAcquireDX9MediaSurfacesKHR(
+            cmdQueue, static_cast<cl_uint>(memObjList.size()),
+            &memObjList.at(0), 0, NULL, &event);
+        if (error != CL_SUCCESS)
+        {
+            log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n",
+                      IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        cl_uint eventType = 0;
+        error = clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(eventType),
+                               &eventType, NULL);
+        if (error != CL_SUCCESS)
+        {
+            log_error("clGetEventInfo failed: %s\n", IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        if (eventType != CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR)
+        {
+            log_error(
+                "Invalid event != CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        clReleaseEvent(event);
+        // Read each plane back and compare with the generated input.
+        size_t origin[3] = { 0, 0, 0 };
+        size_t offset = 0;
+        size_t frameSize = width * height * 3 / 2;
+        std::vector<cl_uchar> out(frameSize, 0);
+        for (size_t i = 0; i < memObjList.size(); ++i)
+        {
+            size_t planeWidth = (i == 0) ? width : width / 2;
+            size_t planeHeight = (i == 0) ? height : height / 2;
+            size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+
+            error =
+                clEnqueueReadImage(cmdQueue, memObjList.at(i), CL_TRUE, origin,
+                                   regionPlane, 0, 0, &out.at(offset), 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueReadImage failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+
+            offset += planeWidth * planeHeight;
+        }
+
+        if (!YUVCompare(surfaceFormat, out, bufferIn, width, height))
+        {
+            log_error("OCL object verification failed - clEnqueueReadImage\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+        // A failed release leaves 'event' stale (freed above): return at once.
+        error = clEnqueueReleaseDX9MediaSurfacesKHR(
+            cmdQueue, static_cast<cl_uint>(memObjList.size()),
+            &memObjList.at(0), 0, NULL, &event);
+        if (error != CL_SUCCESS)
+        {
+            log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n",
+                      IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+        eventType = 0;
+        error = clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(eventType),
+                               &eventType, NULL);
+        if (error != CL_SUCCESS)
+        {
+            log_error("clGetEventInfo failed: %s\n", IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        if (eventType != CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR)
+        {
+            log_error(
+                "Invalid event != CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        clReleaseEvent(event);
+
+        // object verification
+        std::vector<cl_uchar> bufferOut(frameSize, 0);
+        if (!YUVSurfaceGet(surfaceFormat, surface, bufferOut, width, height))
+        {
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        if (!YUVCompare(surfaceFormat, bufferOut, bufferIn, width, height))
+        {
+            log_error("Media surface is different than expected\n");
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+    }
+
+    if (deviceWrapper->Status() != DEVICE_PASS)
+    {
+        std::string adapterName;
+        AdapterToString(adapterType, adapterName);
+        if (deviceWrapper->Status() == DEVICE_FAIL)
+        {
+            log_error("%s init failed\n", adapterName.c_str());
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+        else
+        {
+            log_error("%s init incomplete due to unsupported device\n",
+                      adapterName.c_str());
+            result.ResultSub(CResult::TEST_NOTSUPPORTED);
+        }
+    }
+
+    return result.Result();
+}
+// Runs context_create over every adapter/creation-function/format/handle mix.
+int test_context_create(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements)
+{
+    const unsigned int WIDTH = 256;
+    const unsigned int HEIGHT = 256;
+
+    std::vector<cl_dx9_media_adapter_type_khr> adapterTypes;
+#if defined(_WIN32)
+    adapterTypes.push_back(CL_ADAPTER_D3D9_KHR);
+    adapterTypes.push_back(CL_ADAPTER_D3D9EX_KHR);
+    adapterTypes.push_back(CL_ADAPTER_DXVA_KHR);
+#endif
+
+    std::vector<TContextFuncType> contextFuncs;
+    contextFuncs.push_back(CONTEXT_CREATE_DEFAULT);
+    contextFuncs.push_back(CONTEXT_CREATE_FROM_TYPE);
+
+    std::vector<TSurfaceFormat> formats;
+    formats.push_back(SURFACE_FORMAT_NV12);
+    formats.push_back(SURFACE_FORMAT_YV12);
+
+    std::vector<TSharedHandleType> sharedHandleTypes;
+    sharedHandleTypes.push_back(SHARED_HANDLE_DISABLED);
+#if defined(_WIN32)
+    sharedHandleTypes.push_back(SHARED_HANDLE_ENABLED);
+#endif
+    // Every failing combination is recorded here; the total is returned.
+    CResult result;
+    for (size_t adapterTypeIdx = 0; adapterTypeIdx < adapterTypes.size();
+         ++adapterTypeIdx)
+    {
+        // iteration through all create context functions
+        for (size_t contextFuncIdx = 0; contextFuncIdx < contextFuncs.size();
+             ++contextFuncIdx)
+        {
+            // iteration through surface formats
+            for (size_t formatIdx = 0; formatIdx < formats.size(); ++formatIdx)
+            {
+                // shared handle enabled or disabled
+                for (size_t sharedHandleIdx = 0;
+                     sharedHandleIdx < sharedHandleTypes.size();
+                     ++sharedHandleIdx)
+                {
+                    if (adapterTypes[adapterTypeIdx] == CL_ADAPTER_D3D9_KHR
+                        && sharedHandleTypes[sharedHandleIdx]
+                            == SHARED_HANDLE_ENABLED)
+                        continue;
+                    // (the D3D9 + shared-handle combination is skipped above)
+                    if (context_create(
+                            deviceID, context, queue, num_elements, WIDTH,
+                            HEIGHT, contextFuncs[contextFuncIdx],
+                            adapterTypes[adapterTypeIdx], formats[formatIdx],
+                            sharedHandleTypes[sharedHandleIdx])
+                        != 0)
+                    {
+                        std::string sharedHandle =
+                            (sharedHandleTypes[sharedHandleIdx]
+                             == SHARED_HANDLE_ENABLED)
+                            ? "shared handle"
+                            : "no shared handle";
+                        std::string formatStr;
+                        std::string adapterTypeStr;
+                        SurfaceFormatToString(formats[formatIdx], formatStr);
+                        AdapterToString(adapterTypes[adapterTypeIdx],
+                                        adapterTypeStr);
+
+                        log_error("\nTest case - clCreateContext (%s, %s, %s) "
+                                  "failed\n\n",
+                                  adapterTypeStr.c_str(), formatStr.c_str(),
+                                  sharedHandle.c_str());
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+                }
+            }
+        }
+    }
+
+    return result.Result();
+}
diff --git a/test_conformance/extensions/cl_khr_dx9_media_sharing/test_functions_api.cpp b/test_conformance/extensions/cl_khr_dx9_media_sharing/test_functions_api.cpp
new file mode 100644
index 0000000000..ab92cb890a
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_dx9_media_sharing/test_functions_api.cpp
@@ -0,0 +1,781 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "utils.h"
+
+// Exercises cl_khr_dx9_media_sharing on each adapter the device wrapper
+// yields: creates a width x height media surface, wraps its planes as OpenCL
+// images and, for iterationNum frames, checks read, write, copy,
+// buffer<->image and map operations against generated YUV reference frames.
+// deviceID, context, queue, num_elements are unused. Returns result.Result().
+int api_functions(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements,
+                  unsigned int iterationNum, unsigned int width,
+                  unsigned int height,
+                  cl_dx9_media_adapter_type_khr adapterType,
+                  TSurfaceFormat surfaceFormat, TSharedHandleType sharedHandle)
+{
+    const unsigned int FRAME_NUM = 2; // reference frames cycled by frameIdx
+    const cl_uchar MAX_VALUE = 255 / 2;
+    CResult result;
+
+    // create device
+    std::auto_ptr<CDeviceWrapper> deviceWrapper;
+    if (!DeviceCreate(adapterType, deviceWrapper))
+    {
+        result.ResultSub(CResult::TEST_ERROR);
+        return result.Result();
+    }
+
+    // generate input and expected data (FRAME_NUM frames per reference set)
+    std::vector<std::vector<cl_uchar>> bufferRef1(FRAME_NUM);
+    std::vector<std::vector<cl_uchar>> bufferRef2(FRAME_NUM);
+    std::vector<std::vector<cl_uchar>> bufferRef3(FRAME_NUM);
+    size_t frameSize = width * height * 3 / 2; // w*h + 2 * (w/2 * h/2) bytes
+    cl_uchar step = MAX_VALUE / FRAME_NUM;
+    for (size_t i = 0; i < FRAME_NUM; ++i)
+    {
+        if (!YUVGenerate(surfaceFormat, bufferRef1[i], width, height,
+                         static_cast<cl_uchar>(step * i),
+                         static_cast<cl_uchar>(step * (i + 1)))
+            || !YUVGenerate(surfaceFormat, bufferRef2[i], width, height,
+                            static_cast<cl_uchar>(step * i),
+                            static_cast<cl_uchar>(step * (i + 1)), 0.2)
+            || !YUVGenerate(surfaceFormat, bufferRef3[i], width, height,
+                            static_cast<cl_uchar>(step * i),
+                            static_cast<cl_uchar>(step * (i + 1)), 0.4))
+        {
+            result.ResultSub(CResult::TEST_ERROR);
+            return result.Result();
+        }
+    }
+
+    // iterates through all devices
+    while (deviceWrapper->AdapterNext())
+    {
+        cl_int error;
+        // check if the test can be run on the adapter
+        if (CL_SUCCESS
+            != (error = deviceExistForCLTest(gPlatformIDdetected, adapterType,
+                                             deviceWrapper->Device(), result,
+                                             sharedHandle)))
+        {
+            return result.Result();
+        }
+
+        if (surfaceFormat != SURFACE_FORMAT_NV12
+            && !SurfaceFormatCheck(adapterType, *deviceWrapper, surfaceFormat))
+        {
+            std::string sharedHandleStr =
+                (sharedHandle == SHARED_HANDLE_ENABLED) ? "yes" : "no";
+            std::string formatStr;
+            std::string adapterStr;
+            SurfaceFormatToString(surfaceFormat, formatStr);
+            AdapterToString(adapterType, adapterStr);
+            log_info(
+                "Skipping test case, image format is not supported by a device "
+                "(adapter type: %s, format: %s, shared handle: %s)\n",
+                adapterStr.c_str(), formatStr.c_str(), sharedHandleStr.c_str());
+            return result.Result();
+        }
+
+        void *objectSharedHandle = 0;
+        std::auto_ptr<CSurfaceWrapper> surface;
+
+        // create surface
+        if (!MediaSurfaceCreate(
+                adapterType, width, height, surfaceFormat, *deviceWrapper,
+                surface, (sharedHandle == SHARED_HANDLE_ENABLED) ? true : false,
+                &objectSharedHandle))
+        {
+            log_error("Media surface creation failed for %i adapter\n",
+                      deviceWrapper->AdapterIdx());
+            result.ResultSub(CResult::TEST_ERROR);
+            return result.Result();
+        }
+
+        cl_context_properties contextProperties[] = {
+            CL_CONTEXT_PLATFORM,
+            (cl_context_properties)gPlatformIDdetected,
+            AdapterTypeToContextInfo(adapterType),
+            (cl_context_properties)deviceWrapper->Device(),
+            0,
+        };
+
+        clContextWrapper ctx = clCreateContext(
+            &contextProperties[0], 1, &gDeviceIDdetected, NULL, NULL, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("clCreateContext failed: %s\n", IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+#if defined(_WIN32)
+        cl_dx9_surface_info_khr surfaceInfo;
+        surfaceInfo.resource =
+            *(static_cast<CD3D9SurfaceWrapper *>(surface.get()));
+        surfaceInfo.shared_handle = objectSharedHandle;
+#else
+        void *surfaceInfo = 0;
+        return TEST_NOT_IMPLEMENTED;
+#endif
+
+        std::vector<cl_mem> memObjList;
+        unsigned int planesNum = PlanesNum(surfaceFormat);
+        std::vector<clMemWrapper> planesList(planesNum);
+        for (unsigned int planeIdx = 0; planeIdx < planesNum; ++planeIdx)
+        {
+            planesList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(
+                ctx, CL_MEM_READ_WRITE, adapterType, &surfaceInfo, planeIdx,
+                &error);
+            if (error != CL_SUCCESS)
+            {
+                log_error(
+                    "clCreateFromDX9MediaSurfaceKHR failed for plane %u: %s\n",
+                    planeIdx, IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+            memObjList.push_back(planesList[planeIdx]);
+        }
+
+        clCommandQueueWrapper cmdQueue = clCreateCommandQueueWithProperties(
+            ctx, gDeviceIDdetected, 0, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("Unable to create command queue: %s\n",
+                      IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        if (!ImageInfoVerify(adapterType, memObjList, width, height, surface,
+                             objectSharedHandle))
+        {
+            log_error("Image info verification failed\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        // frameIdx matches iterationNum's type and the %u used in the logs
+        for (unsigned int frameIdx = 0; frameIdx < iterationNum; ++frameIdx)
+        {
+            if (!YUVSurfaceSet(surfaceFormat, surface,
+                               bufferRef1[frameIdx % FRAME_NUM], width, height))
+            {
+                result.ResultSub(CResult::TEST_ERROR);
+                return result.Result();
+            }
+
+            error = clEnqueueAcquireDX9MediaSurfacesKHR(
+                cmdQueue, static_cast<cl_uint>(memObjList.size()),
+                &memObjList[0], 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+
+            { // read operation
+                std::vector<cl_uchar> out(frameSize, 0);
+                size_t offset = 0;
+                size_t origin[3] = { 0, 0, 0 };
+
+                for (size_t i = 0; i < memObjList.size(); ++i)
+                {
+                    size_t planeWidth = (i == 0) ? width : width / 2;
+                    size_t planeHeight = (i == 0) ? height : height / 2;
+                    size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+
+                    error = clEnqueueReadImage(cmdQueue, memObjList[i], CL_TRUE,
+                                               origin, regionPlane, 0, 0,
+                                               &out[offset], 0, 0, 0);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clEnqueueReadImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    offset += planeWidth * planeHeight;
+                }
+
+                if (!YUVCompare(surfaceFormat, out,
+                                bufferRef1[frameIdx % FRAME_NUM], width,
+                                height))
+                {
+                    log_error("Frame idx: %u, OCL image is different then "
+                              "shared OCL object: clEnqueueReadImage\n",
+                              frameIdx);
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+            }
+
+            { // write operation
+                size_t offset = 0;
+                size_t origin[3] = { 0, 0, 0 };
+                for (size_t i = 0; i < memObjList.size(); ++i)
+                {
+                    size_t planeWidth = (i == 0) ? width : width / 2;
+                    size_t planeHeight = (i == 0) ? height : height / 2;
+                    size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+
+                    error = clEnqueueWriteImage(
+                        cmdQueue, memObjList[i], CL_TRUE, origin, regionPlane,
+                        0, 0, &bufferRef2[frameIdx % FRAME_NUM][offset], 0, 0,
+                        0);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clEnqueueWriteImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    offset += planeWidth * planeHeight;
+                }
+            }
+
+            { // read operation
+                std::vector<cl_uchar> out(frameSize, 0);
+                size_t offset = 0;
+                size_t origin[3] = { 0, 0, 0 };
+
+                for (size_t i = 0; i < memObjList.size(); ++i)
+                {
+                    size_t planeWidth = (i == 0) ? width : width / 2;
+                    size_t planeHeight = (i == 0) ? height : height / 2;
+                    size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+
+                    error = clEnqueueReadImage(cmdQueue, memObjList[i], CL_TRUE,
+                                               origin, regionPlane, 0, 0,
+                                               &out[offset], 0, 0, 0);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clEnqueueReadImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    offset += planeWidth * planeHeight;
+                }
+
+                if (!YUVCompare(surfaceFormat, out,
+                                bufferRef2[frameIdx % FRAME_NUM], width,
+                                height))
+                {
+                    log_error("Frame idx: %u, Shared OCL image verification "
+                              "after clEnqueueWriteImage failed\n",
+                              frameIdx);
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+            }
+
+            { // copy operation (shared OCL to OCL)
+                size_t offset = 0;
+                size_t origin[3] = { 0, 0, 0 };
+                std::vector<cl_uchar> out(frameSize, 0);
+                for (size_t i = 0; i < memObjList.size(); ++i)
+                {
+                    size_t planeWidth = (i == 0) ? width : width / 2;
+                    size_t planeHeight = (i == 0) ? height : height / 2;
+                    size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+
+                    cl_image_format formatPlane;
+                    formatPlane.image_channel_data_type = CL_UNORM_INT8;
+                    formatPlane.image_channel_order =
+                        (surfaceFormat == SURFACE_FORMAT_NV12 && i > 0) ? CL_RG
+                                                                        : CL_R;
+
+                    cl_image_desc imageDesc = { 0 };
+                    imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
+                    imageDesc.image_width = planeWidth;
+                    imageDesc.image_height = planeHeight;
+
+                    clMemWrapper planeOCL =
+                        clCreateImage(ctx, CL_MEM_READ_WRITE, &formatPlane,
+                                      &imageDesc, 0, &error);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clCreateImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    error = clEnqueueCopyImage(cmdQueue, memObjList[i],
+                                               planeOCL, origin, origin,
+                                               regionPlane, 0, 0, 0);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clEnqueueCopyImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    error = clEnqueueReadImage(cmdQueue, planeOCL, CL_TRUE,
+                                               origin, regionPlane, 0, 0,
+                                               &out[offset], 0, 0, 0);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clEnqueueReadImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    offset += planeWidth * planeHeight;
+                }
+
+                if (!YUVCompare(surfaceFormat, out,
+                                bufferRef2[frameIdx % FRAME_NUM], width,
+                                height))
+                {
+                    log_error(
+                        "Frame idx: %u, OCL image verification after "
+                        "clEnqueueCopyImage (from shared OCL to OCL) failed\n",
+                        frameIdx);
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+            }
+
+            { // copy operation (OCL to shared OCL)
+                size_t offset = 0;
+                size_t origin[3] = { 0, 0, 0 };
+                std::vector<cl_uchar> out(frameSize, 0);
+                for (size_t i = 0; i < memObjList.size(); ++i)
+                {
+                    size_t planeWidth = (i == 0) ? width : width / 2;
+                    size_t planeHeight = (i == 0) ? height : height / 2;
+                    size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+                    // NV12 planes past 0 are CL_RG: a row is width bytes
+                    const bool rgPlane =
+                        surfaceFormat == SURFACE_FORMAT_NV12 && i > 0;
+                    size_t pitchSize =
+                        (rgPlane ? width : planeWidth) * sizeof(cl_uchar);
+
+                    cl_image_format formatPlane;
+                    formatPlane.image_channel_data_type = CL_UNORM_INT8;
+                    formatPlane.image_channel_order = rgPlane ? CL_RG : CL_R;
+
+                    cl_image_desc imageDesc = { 0 };
+                    imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
+                    imageDesc.image_width = planeWidth;
+                    imageDesc.image_height = planeHeight;
+                    imageDesc.image_row_pitch = pitchSize;
+
+                    clMemWrapper planeOCL = clCreateImage(
+                        ctx, CL_MEM_COPY_HOST_PTR, &formatPlane, &imageDesc,
+                        &bufferRef1[frameIdx % FRAME_NUM][offset], &error);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clCreateImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    error = clEnqueueCopyImage(cmdQueue, planeOCL,
+                                               memObjList[i], origin, origin,
+                                               regionPlane, 0, 0, 0);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clEnqueueCopyImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    error = clEnqueueReadImage(cmdQueue, memObjList[i], CL_TRUE,
+                                               origin, regionPlane, 0, 0,
+                                               &out[offset], 0, 0, 0);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clEnqueueReadImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    offset += planeWidth * planeHeight;
+                }
+
+                if (!YUVCompare(surfaceFormat, out,
+                                bufferRef1[frameIdx % FRAME_NUM], width,
+                                height))
+                {
+                    log_error(
+                        "Frame idx: %u, OCL image verification after "
+                        "clEnqueueCopyImage (from OCL to shared OCL) failed\n",
+                        frameIdx);
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+            }
+
+            { // copy from image to buffer
+                size_t offset = 0;
+                size_t origin[3] = { 0, 0, 0 };
+                size_t bufferSize = sizeof(cl_uchar) * frameSize;
+                clMemWrapper buffer = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+                                                     bufferSize, NULL, &error);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("clCreateBuffer failed: %s\n",
+                              IGetErrorString(error));
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                for (size_t i = 0; i < memObjList.size(); ++i)
+                {
+                    size_t planeWidth = (i == 0) ? width : width / 2;
+                    size_t planeHeight = (i == 0) ? height : height / 2;
+                    size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+
+                    error = clEnqueueCopyImageToBuffer(
+                        cmdQueue, memObjList[i], buffer, origin, regionPlane,
+                        offset, 0, 0, 0);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clEnqueueCopyImageToBuffer failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    offset += planeWidth * planeHeight * sizeof(cl_uchar);
+                }
+
+                std::vector<cl_uchar> out(frameSize, 0);
+                error = clEnqueueReadBuffer(cmdQueue, buffer, CL_TRUE, 0,
+                                            bufferSize, &out[0], 0, NULL, NULL);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("Unable to read buffer\n");
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                if (!YUVCompare(surfaceFormat, out,
+                                bufferRef1[frameIdx % FRAME_NUM], width,
+                                height))
+                {
+                    log_error("Frame idx: %u, OCL buffer verification after "
+                              "clEnqueueCopyImageToBuffer (from shared OCL "
+                              "image to OCL buffer) failed\n",
+                              frameIdx);
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+            }
+
+            { // copy buffer to image
+                size_t bufferSize = sizeof(cl_uchar) * frameSize;
+                clMemWrapper buffer = clCreateBuffer(
+                    ctx, CL_MEM_COPY_HOST_PTR, bufferSize,
+                    &bufferRef2[frameIdx % FRAME_NUM][0], &error);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("clCreateBuffer failed: %s\n",
+                              IGetErrorString(error));
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                size_t offset = 0;
+                size_t origin[3] = { 0, 0, 0 };
+                std::vector<cl_uchar> out(frameSize, 0);
+                for (size_t i = 0; i < memObjList.size(); ++i)
+                {
+                    size_t planeWidth = (i == 0) ? width : width / 2;
+                    size_t planeHeight = (i == 0) ? height : height / 2;
+                    size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+
+                    error = clEnqueueCopyBufferToImage(
+                        cmdQueue, buffer, memObjList[i], offset, origin,
+                        regionPlane, 0, 0, 0);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clEnqueueCopyBufferToImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    error = clEnqueueReadImage(cmdQueue, memObjList[i], CL_TRUE,
+                                               origin, regionPlane, 0, 0,
+                                               &out[offset], 0, 0, 0);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clEnqueueReadImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    offset += planeWidth * planeHeight * sizeof(cl_uchar);
+                }
+
+                if (!YUVCompare(surfaceFormat, out,
+                                bufferRef2[frameIdx % FRAME_NUM], width,
+                                height))
+                {
+                    log_error("Frame idx: %u, OCL image verification after "
+                              "clEnqueueCopyBufferToImage (from OCL buffer to "
+                              "shared OCL image) failed\n",
+                              frameIdx);
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+            }
+
+            { // map operation to read
+                size_t offset = 0;
+                size_t origin[3] = { 0, 0, 0 };
+                std::vector<cl_uchar> out(frameSize, 0);
+                for (size_t i = 0; i < memObjList.size(); ++i)
+                {
+                    size_t planeWidth = (i == 0) ? width : width / 2;
+                    size_t planeHeight = (i == 0) ? height : height / 2;
+                    size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+                    const bool wideRow =
+                        surfaceFormat == SURFACE_FORMAT_NV12 && i > 0;
+                    size_t pitchSize = wideRow ? width : planeWidth; // bytes
+                    size_t rowPitch = 0, slicePitch = 0;
+                    void *mapPtr = clEnqueueMapImage(
+                        cmdQueue, memObjList[i], CL_TRUE, CL_MAP_READ, origin,
+                        regionPlane, &rowPitch, &slicePitch, 0, 0, 0, &error);
+                    if (error != CL_SUCCESS || mapPtr == NULL)
+                    {
+                        log_error("clEnqueueMapImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                        offset += pitchSize * planeHeight;
+                        continue; // nothing mapped: skip the copy and unmap
+                    }
+
+                    for (size_t y = 0; y < planeHeight; ++y)
+                        memcpy(&out[offset + y * pitchSize],
+                               static_cast<cl_uchar *>(mapPtr) + y * rowPitch,
+                               pitchSize * sizeof(cl_uchar));
+
+                    error = clEnqueueUnmapMemObject(cmdQueue, memObjList[i],
+                                                    mapPtr, 0, 0, 0);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clEnqueueUnmapMemObject failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    offset += pitchSize * planeHeight;
+                }
+
+                if (!YUVCompare(surfaceFormat, out,
+                                bufferRef2[frameIdx % FRAME_NUM], width,
+                                height))
+                {
+                    log_error("Frame idx: %u, Mapped shared OCL image is "
+                              "different then expected\n",
+                              frameIdx);
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+            }
+
+            { // map operation to write
+                size_t offset = 0;
+                size_t origin[3] = { 0, 0, 0 };
+                for (size_t i = 0; i < memObjList.size(); ++i)
+                {
+                    size_t planeWidth = (i == 0) ? width : width / 2;
+                    size_t planeHeight = (i == 0) ? height : height / 2;
+                    size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+                    const bool wideRow =
+                        surfaceFormat == SURFACE_FORMAT_NV12 && i > 0;
+                    size_t pitchSize = wideRow ? width : planeWidth; // bytes
+                    size_t rowPitch = 0, slicePitch = 0;
+                    void *mapPtr = clEnqueueMapImage(
+                        cmdQueue, memObjList[i], CL_TRUE, CL_MAP_WRITE, origin,
+                        regionPlane, &rowPitch, &slicePitch, 0, 0, 0, &error);
+                    if (error != CL_SUCCESS || mapPtr == NULL)
+                    {
+                        log_error("clEnqueueMapImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                        offset += pitchSize * planeHeight;
+                        continue; // nothing mapped: skip the copy and unmap
+                    }
+
+                    for (size_t y = 0; y < planeHeight; ++y)
+                        memcpy(static_cast<cl_uchar *>(mapPtr) + y * rowPitch,
+                               &bufferRef3[frameIdx % FRAME_NUM]
+                                          [offset + y * pitchSize],
+                               pitchSize * sizeof(cl_uchar));
+
+                    error = clEnqueueUnmapMemObject(cmdQueue, memObjList[i],
+                                                    mapPtr, 0, 0, 0);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clEnqueueUnmapMemObject failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    offset += pitchSize * planeHeight;
+                }
+            }
+
+            error = clEnqueueReleaseDX9MediaSurfacesKHR(
+                cmdQueue, static_cast<cl_uint>(memObjList.size()),
+                &memObjList[0], 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+
+            std::vector<cl_uchar> bufferOut(frameSize, 0);
+            if (!YUVSurfaceGet(surfaceFormat, surface, bufferOut, width,
+                               height))
+            {
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+
+            if (!YUVCompare(surfaceFormat, bufferOut,
+                            bufferRef3[frameIdx % FRAME_NUM], width, height))
+            {
+                log_error(
+                    "Frame idx: %u, media surface is different than expected\n",
+                    frameIdx);
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+        }
+    }
+
+    if (deviceWrapper->Status() != DEVICE_PASS)
+    {
+        std::string adapterName;
+        AdapterToString(adapterType, adapterName);
+        if (deviceWrapper->Status() == DEVICE_FAIL)
+        {
+            log_error("%s init failed\n", adapterName.c_str());
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+        else
+        {
+            log_error("%s init incomplete due to unsupported device\n",
+                      adapterName.c_str());
+            result.ResultSub(CResult::TEST_NOTSUPPORTED);
+        }
+    }
+
+    return result.Result();
+}
+
+// Windows-only entry point for the DX9 media sharing API test: runs
+// api_functions() for a fixed list of adapter / surface format / shared
+// handle combinations and reports TEST_FAIL for each run that fails.
+int test_api(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
+{
+    CResult result;
+
+#if defined(_WIN32)
+    // One row per api_functions() run; rows are executed in table order.
+    struct ApiCase
+    {
+        unsigned int iterations;
+        unsigned int width;
+        unsigned int height;
+        cl_dx9_media_adapter_type_khr adapter;
+        TSurfaceFormat format;
+        TSharedHandleType handleMode;
+        const char *failureText; // logged verbatim when the run fails
+    };
+
+    static const ApiCase kCases[] = {
+        // D3D9
+        {
+            10, 256, 256, // iterations, width, height
+            CL_ADAPTER_D3D9_KHR, SURFACE_FORMAT_NV12,
+            SHARED_HANDLE_DISABLED,
+            "\nTest case (D3D9, NV12, no shared handle) failed\n\n",
+        },
+        {
+            3, 512, 256, // iterations, width, height
+            CL_ADAPTER_D3D9_KHR, SURFACE_FORMAT_YV12,
+            SHARED_HANDLE_DISABLED,
+            "\nTest case (D3D9, YV12, no shared handle) failed\n\n",
+        },
+        // D3D9EX
+        {
+            5, 256, 512, // iterations, width, height
+            CL_ADAPTER_D3D9EX_KHR, SURFACE_FORMAT_NV12,
+            SHARED_HANDLE_DISABLED,
+            "\nTest case (D3D9EX, NV12, no shared handle) failed\n\n",
+        },
+        {
+            7, 512, 256, // iterations, width, height
+            CL_ADAPTER_D3D9EX_KHR, SURFACE_FORMAT_NV12,
+            SHARED_HANDLE_ENABLED,
+            "\nTest case (D3D9EX, NV12, shared handle) failed\n\n",
+        },
+        {
+            10, 256, 256, // iterations, width, height
+            CL_ADAPTER_D3D9EX_KHR, SURFACE_FORMAT_YV12,
+            SHARED_HANDLE_DISABLED,
+            "\nTest case (D3D9EX, YV12, no shared handle) failed\n\n",
+        },
+        {
+            15, 128, 128, // iterations, width, height
+            CL_ADAPTER_D3D9EX_KHR, SURFACE_FORMAT_YV12,
+            SHARED_HANDLE_ENABLED,
+            "\nTest case (D3D9EX, YV12, shared handle) failed\n\n",
+        },
+        // DXVA
+        {
+            20, 128, 128, // iterations, width, height
+            CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_NV12,
+            SHARED_HANDLE_DISABLED,
+            "\nTest case (DXVA, NV12, no shared handle) failed\n\n",
+        },
+        {
+            40, 64, 64, // iterations, width, height
+            CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_NV12,
+            SHARED_HANDLE_ENABLED,
+            "\nTest case (DXVA, NV12, shared handle) failed\n\n",
+        },
+        {
+            5, 512, 512, // iterations, width, height
+            CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_YV12,
+            SHARED_HANDLE_DISABLED,
+            "\nTest case (DXVA, YV12, no shared handle) failed\n\n",
+        },
+        {
+            2, 1024, 1024, // iterations, width, height
+            CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_YV12,
+            SHARED_HANDLE_ENABLED,
+            "\nTest case (DXVA, YV12, shared handle) failed\n\n",
+        },
+    };
+
+    const size_t caseCount = sizeof(kCases) / sizeof(kCases[0]);
+    for (size_t idx = 0; idx < caseCount; ++idx)
+    {
+        const ApiCase &tc = kCases[idx];
+        const int status = api_functions(
+            deviceID, context, queue, num_elements, tc.iterations, tc.width,
+            tc.height, tc.adapter, tc.format, tc.handleMode);
+        if (status == 0) continue;
+
+        log_error("%s", tc.failureText);
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+#else
+    return TEST_NOT_IMPLEMENTED;
+#endif
+
+    return result.Result();
+}
diff --git a/test_conformance/extensions/cl_khr_dx9_media_sharing/test_functions_kernel.cpp b/test_conformance/extensions/cl_khr_dx9_media_sharing/test_functions_kernel.cpp
new file mode 100644
index 0000000000..a204440dd6
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_dx9_media_sharing/test_functions_kernel.cpp
@@ -0,0 +1,541 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "harness/errorHelpers.h"
+#include "harness/kernelHelpers.h"
+
+#include "utils.h"
+
+// Round-trips frames through a kernel on shared DX9 media surfaces and checks
+// both the CL readback and the destination surface against expected data.
+int kernel_functions(cl_device_id deviceID, cl_context context,
+                     cl_command_queue queue, int num_elements,
+                     unsigned int iterationNum, unsigned int width,
+                     unsigned int height,
+                     cl_dx9_media_adapter_type_khr adapterType,
+                     TSurfaceFormat surfaceFormat,
+                     TSharedHandleType sharedHandle)
+{
+    const unsigned int FRAME_NUM = 2; // distinct frames, cycled by frameIdx
+    const cl_uchar MAX_VALUE = 255 / 2;
+    const std::string PROGRAM_STR =
+        "__kernel void TestFunction( read_only image2d_t planeIn, write_only "
+        "image2d_t planeOut, " NL "                            sampler_t "
+        "sampler, __global int *planeRes)" NL "{" NL
+        "  int w = get_global_id(0);" NL "  int h = get_global_id(1);" NL
+        "  int width = get_image_width(planeIn);" NL
+        "  int height = get_image_height(planeOut);" NL
+        "  float4 color0 = read_imagef(planeIn, sampler, (int2)(w,h)) + "
+        "0.2f;" NL "  float4 color1 = read_imagef(planeIn, sampler, "
+        "(float2)(w,h)) + 0.2f;" NL
+        "  color0 = (color0 == color1) ? color0: (float4)(0.5, 0.5, 0.5, "
+        "0.5);" NL "  write_imagef(planeOut, (int2)(w,h), color0);" NL
+        "  if(w == 0 && h == 0)" NL "  {" NL "    planeRes[0] = width;" NL
+        "    planeRes[1] = height;" NL "  }" NL "}";
+
+    CResult result;
+
+    std::auto_ptr<CDeviceWrapper> deviceWrapper;
+    if (!DeviceCreate(adapterType, deviceWrapper))
+    {
+        result.ResultSub(CResult::TEST_ERROR);
+        return result.Result();
+    }
+    // Build the input frames and the frames expected back from the kernel.
+    std::vector<std::vector<cl_uchar>> bufferIn(FRAME_NUM);
+    std::vector<std::vector<cl_uchar>> bufferExp(FRAME_NUM);
+    size_t frameSize = width * height * 3 / 2;
+    cl_uchar step = MAX_VALUE / FRAME_NUM;
+    for (size_t i = 0; i < FRAME_NUM; ++i)
+    {
+        if (!YUVGenerate(surfaceFormat, bufferIn[i], width, height,
+                         static_cast<cl_uchar>(step * i),
+                         static_cast<cl_uchar>(step * (i + 1)))
+            || !YUVGenerate(surfaceFormat, bufferExp[i], width, height,
+                            static_cast<cl_uchar>(step * i),
+                            static_cast<cl_uchar>(step * (i + 1)), 0.2))
+        {
+            result.ResultSub(CResult::TEST_ERROR);
+            return result.Result();
+        }
+    }
+
+    while (deviceWrapper->AdapterNext())
+    {
+        cl_int error;
+        // check if the test can be run on the adapter
+        if (CL_SUCCESS
+            != (error = deviceExistForCLTest(gPlatformIDdetected, adapterType,
+                                             deviceWrapper->Device(), result,
+                                             sharedHandle)))
+        {
+            return result.Result();
+        }
+        // Only formats other than NV12 are checked for device support.
+        if (surfaceFormat != SURFACE_FORMAT_NV12
+            && !SurfaceFormatCheck(adapterType, *deviceWrapper, surfaceFormat))
+        {
+            std::string sharedHandleStr =
+                (sharedHandle == SHARED_HANDLE_ENABLED) ? "yes" : "no";
+            std::string formatStr;
+            std::string adapterStr;
+            SurfaceFormatToString(surfaceFormat, formatStr);
+            AdapterToString(adapterType, adapterStr);
+            log_info(
+                "Skipping test case, image format is not supported by a device "
+                "(adapter type: %s, format: %s, shared handle: %s)\n",
+                adapterStr.c_str(), formatStr.c_str(), sharedHandleStr.c_str());
+            return result.Result();
+        }
+
+        void *objectSrcHandle = 0;
+        std::auto_ptr<CSurfaceWrapper> surfaceSrc;
+        if (!MediaSurfaceCreate(adapterType, width, height, surfaceFormat,
+                                *deviceWrapper, surfaceSrc,
+                                sharedHandle == SHARED_HANDLE_ENABLED,
+                                &objectSrcHandle))
+        {
+            log_error("Media surface creation failed for %i adapter\n",
+                      deviceWrapper->AdapterIdx());
+            result.ResultSub(CResult::TEST_ERROR);
+            return result.Result();
+        }
+
+        void *objectDstHandle = 0;
+        std::auto_ptr<CSurfaceWrapper> surfaceDst;
+        if (!MediaSurfaceCreate(adapterType, width, height, surfaceFormat,
+                                *deviceWrapper, surfaceDst,
+                                sharedHandle == SHARED_HANDLE_ENABLED,
+                                &objectDstHandle))
+        {
+            log_error("Media surface creation failed for %i adapter\n",
+                      deviceWrapper->AdapterIdx());
+            result.ResultSub(CResult::TEST_ERROR);
+            return result.Result();
+        }
+
+        cl_context_properties contextProperties[] = {
+            CL_CONTEXT_PLATFORM,
+            (cl_context_properties)gPlatformIDdetected,
+            AdapterTypeToContextInfo(adapterType),
+            (cl_context_properties)deviceWrapper->Device(),
+            0,
+        };
+
+        clContextWrapper ctx = clCreateContext(
+            &contextProperties[0], 1, &gDeviceIDdetected, NULL, NULL, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("clCreateContext failed: %s\n", IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+#if defined(_WIN32)
+        cl_dx9_surface_info_khr surfaceInfoSrc;
+        surfaceInfoSrc.resource =
+            *(static_cast<CD3D9SurfaceWrapper *>(surfaceSrc.get()));
+        surfaceInfoSrc.shared_handle = objectSrcHandle;
+
+        cl_dx9_surface_info_khr surfaceInfoDst;
+        surfaceInfoDst.resource =
+            *(static_cast<CD3D9SurfaceWrapper *>(surfaceDst.get()));
+        surfaceInfoDst.shared_handle = objectDstHandle;
+#else
+        void *surfaceInfoSrc = 0;
+        void *surfaceInfoDst = 0;
+        return TEST_NOT_IMPLEMENTED;
+#endif
+        // One cl_mem per plane; the plain cl_mem lists feed acquire/release.
+        std::vector<cl_mem> memObjSrcList;
+        std::vector<cl_mem> memObjDstList;
+        unsigned int planesNum = PlanesNum(surfaceFormat);
+        std::vector<clMemWrapper> planeSrcList(planesNum);
+        std::vector<clMemWrapper> planeDstList(planesNum);
+        for (unsigned int planeIdx = 0; planeIdx < planesNum; ++planeIdx)
+        {
+            planeSrcList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(
+                ctx, CL_MEM_READ_WRITE, adapterType, &surfaceInfoSrc, planeIdx,
+                &error);
+            if (error != CL_SUCCESS)
+            {
+                log_error(
+                    "clCreateFromDX9MediaSurfaceKHR failed for plane %i: %s\n",
+                    planeIdx, IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+            memObjSrcList.push_back(planeSrcList[planeIdx]);
+
+            planeDstList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(
+                ctx, CL_MEM_READ_WRITE, adapterType, &surfaceInfoDst, planeIdx,
+                &error);
+            if (error != CL_SUCCESS)
+            {
+                log_error(
+                    "clCreateFromDX9MediaSurfaceKHR failed for plane %i: %s\n",
+                    planeIdx, IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+            memObjDstList.push_back(planeDstList[planeIdx]);
+        }
+
+        clCommandQueueWrapper cmdQueue = clCreateCommandQueueWithProperties(
+            ctx, gDeviceIDdetected, 0, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("Unable to create command queue: %s\n",
+                      IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        if (!ImageInfoVerify(adapterType, memObjSrcList, width, height,
+                             surfaceSrc, objectSrcHandle))
+        {
+            log_error("Image info verification failed\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+        // Frame-invariant objects: built once per adapter, not once per frame.
+        clSamplerWrapper sampler = clCreateSampler(
+            ctx, CL_FALSE, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("Unable to create sampler\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+        const char *progPtr = PROGRAM_STR.c_str();
+        if (create_single_kernel_helper(ctx, &program, &kernel, 1,
+                                        (const char **)&progPtr,
+                                        "TestFunction"))
+            result.ResultSub(CResult::TEST_FAIL);
+
+        for (size_t frameIdx = 0; frameIdx < iterationNum; ++frameIdx)
+        {
+            if (!YUVSurfaceSet(surfaceFormat, surfaceSrc,
+                               bufferIn[frameIdx % FRAME_NUM], width, height))
+            {
+                result.ResultSub(CResult::TEST_ERROR);
+                return result.Result();
+            }
+
+            error = clEnqueueAcquireDX9MediaSurfacesKHR(
+                cmdQueue, static_cast<cl_uint>(memObjSrcList.size()),
+                &memObjSrcList[0], 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+
+            error = clEnqueueAcquireDX9MediaSurfacesKHR(
+                cmdQueue, static_cast<cl_uint>(memObjDstList.size()),
+                &memObjDstList[0], 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+
+            size_t bufferSize = sizeof(cl_int) * 2;
+            clMemWrapper imageRes = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
+                                                   bufferSize, NULL, &error);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clCreateBuffer failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+            // Run the kernel on each plane and pack the readbacks into 'out'.
+            size_t offset = 0;
+            size_t origin[3] = { 0, 0, 0 };
+            std::vector<cl_uchar> out(frameSize, 0);
+            for (size_t i = 0; i < memObjSrcList.size(); ++i)
+            {
+                size_t planeWidth = (i == 0) ? width : width / 2;
+                size_t planeHeight = (i == 0) ? height : height / 2;
+                size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+                size_t threads[2] = { planeWidth, planeHeight };
+
+                error = clSetKernelArg(kernel, 0, sizeof(memObjSrcList[i]),
+                                       &memObjSrcList[i]);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("Unable to set kernel arguments\n");
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                error = clSetKernelArg(kernel, 1, sizeof(memObjDstList[i]),
+                                       &memObjDstList[i]);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("Unable to set kernel arguments\n");
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                error = clSetKernelArg(kernel, 2, sizeof(sampler), &sampler);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("Unable to set kernel arguments\n");
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                error = clSetKernelArg(kernel, 3, sizeof(imageRes), &imageRes);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("Unable to set kernel arguments\n");
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                size_t localThreads[2];
+                error = get_max_common_2D_work_group_size(ctx, kernel, threads,
+                                                          localThreads);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("Unable to get work group size to use\n");
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                error =
+                    clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, threads,
+                                           localThreads, 0, NULL, NULL);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("Unable to execute test kernel\n");
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                std::vector<cl_uint> imageResOut(2, 0);
+                error = clEnqueueReadBuffer(cmdQueue, imageRes, CL_TRUE, 0,
+                                            bufferSize, &imageResOut[0], 0,
+                                            NULL, NULL);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("Unable to read buffer\n");
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                if (imageResOut[0] != planeWidth)
+                {
+                    log_error("Invalid width value, test = %u, expected = %u\n",
+                              imageResOut[0], static_cast<cl_uint>(planeWidth));
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                if (imageResOut[1] != planeHeight)
+                {
+                    log_error(
+                        "Invalid height value, test = %u, expected = %u\n",
+                        imageResOut[1], static_cast<cl_uint>(planeHeight));
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                error = clEnqueueReadImage(cmdQueue, memObjDstList[i], CL_TRUE,
+                                           origin, regionPlane, 0, 0,
+                                           &out[offset], 0, 0, 0);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("clEnqueueReadImage failed: %s\n",
+                              IGetErrorString(error));
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                offset += planeWidth * planeHeight;
+            }
+
+            if (!YUVCompare(surfaceFormat, out, bufferExp[frameIdx % FRAME_NUM],
+                            width, height))
+            {
+                log_error(
+                    "Frame idx: %u, OCL objects are different than expected\n",
+                    static_cast<cl_uint>(frameIdx));
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+
+            error = clEnqueueReleaseDX9MediaSurfacesKHR(
+                cmdQueue, static_cast<cl_uint>(memObjSrcList.size()),
+                &memObjSrcList[0], 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+
+            error = clEnqueueReleaseDX9MediaSurfacesKHR(
+                cmdQueue, static_cast<cl_uint>(memObjDstList.size()),
+                &memObjDstList[0], 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+            // After release, the destination media surface must match too.
+            std::vector<cl_uchar> bufferOut(frameSize, 0);
+            if (!YUVSurfaceGet(surfaceFormat, surfaceDst, bufferOut, width,
+                               height))
+            {
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+
+            if (!YUVCompare(surfaceFormat, bufferOut,
+                            bufferExp[frameIdx % FRAME_NUM], width, height))
+            {
+                log_error(
+                    "Frame idx: %u, media surface is different than expected\n",
+                    static_cast<cl_uint>(frameIdx));
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+        }
+    }
+
+    if (deviceWrapper->Status() != DEVICE_PASS)
+    {
+        std::string adapterName;
+        AdapterToString(adapterType, adapterName);
+        if (deviceWrapper->Status() == DEVICE_FAIL)
+        {
+            log_error("%s init failed\n", adapterName.c_str());
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+        else
+        {
+            log_error("%s init incomplete due to unsupported device\n",
+                      adapterName.c_str());
+            result.ResultSub(CResult::TEST_NOTSUPPORTED);
+        }
+    }
+
+    return result.Result();
+}
+
+int test_kernel(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
+{
+    // Runs every adapter/format/shared-handle case and merges the outcomes.
+    CResult result;
+#if defined(_WIN32)
+    // D3D9 adapter
+    int status = kernel_functions(deviceID, context, queue, num_elements,
+                                  10, 256, 256, CL_ADAPTER_D3D9_KHR,
+                                  SURFACE_FORMAT_NV12, SHARED_HANDLE_DISABLED);
+    if (status != 0)
+    {
+        log_error("\nTest case (D3D9, NV12, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    status = kernel_functions(deviceID, context, queue, num_elements,
+                              3, 256, 256, CL_ADAPTER_D3D9_KHR,
+                              SURFACE_FORMAT_YV12, SHARED_HANDLE_DISABLED);
+    if (status != 0)
+    {
+        log_error("\nTest case (D3D9, YV12, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    // D3D9EX adapter
+    status = kernel_functions(deviceID, context, queue, num_elements,
+                              5, 256, 512, CL_ADAPTER_D3D9EX_KHR,
+                              SURFACE_FORMAT_NV12, SHARED_HANDLE_DISABLED);
+    if (status != 0)
+    {
+        log_error("\nTest case (D3D9EX, NV12, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    status = kernel_functions(deviceID, context, queue, num_elements,
+                              7, 512, 256, CL_ADAPTER_D3D9EX_KHR,
+                              SURFACE_FORMAT_NV12, SHARED_HANDLE_ENABLED);
+    if (status != 0)
+    {
+        log_error("\nTest case (D3D9EX, NV12, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    status = kernel_functions(deviceID, context, queue, num_elements,
+                              10, 256, 256, CL_ADAPTER_D3D9EX_KHR,
+                              SURFACE_FORMAT_YV12, SHARED_HANDLE_DISABLED);
+    if (status != 0)
+    {
+        log_error("\nTest case (D3D9EX, YV12, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    status = kernel_functions(deviceID, context, queue, num_elements,
+                              15, 128, 128, CL_ADAPTER_D3D9EX_KHR,
+                              SURFACE_FORMAT_YV12, SHARED_HANDLE_ENABLED);
+    if (status != 0)
+    {
+        log_error("\nTest case (D3D9EX, YV12, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    // DXVA adapter
+    status = kernel_functions(deviceID, context, queue, num_elements,
+                              20, 128, 128, CL_ADAPTER_DXVA_KHR,
+                              SURFACE_FORMAT_NV12, SHARED_HANDLE_DISABLED);
+    if (status != 0)
+    {
+        log_error("\nTest case (DXVA, NV12, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    status = kernel_functions(deviceID, context, queue, num_elements,
+                              40, 64, 64, CL_ADAPTER_DXVA_KHR,
+                              SURFACE_FORMAT_NV12, SHARED_HANDLE_ENABLED);
+    if (status != 0)
+    {
+        log_error("\nTest case (DXVA, NV12, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    status = kernel_functions(deviceID, context, queue, num_elements,
+                              5, 512, 512, CL_ADAPTER_DXVA_KHR,
+                              SURFACE_FORMAT_YV12, SHARED_HANDLE_DISABLED);
+    if (status != 0)
+    {
+        log_error("\nTest case (DXVA, YV12, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    status = kernel_functions(deviceID, context, queue, num_elements,
+                              2, 1024, 1024, CL_ADAPTER_DXVA_KHR,
+                              SURFACE_FORMAT_YV12, SHARED_HANDLE_ENABLED);
+    if (status != 0)
+    {
+        log_error("\nTest case (DXVA, YV12, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+#else
+    return TEST_NOT_IMPLEMENTED;
+#endif
+
+    return result.Result();
+}
diff --git a/test_conformance/extensions/cl_khr_dx9_media_sharing/test_get_device_ids.cpp b/test_conformance/extensions/cl_khr_dx9_media_sharing/test_get_device_ids.cpp
new file mode 100644
index 0000000000..613a602c69
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_dx9_media_sharing/test_get_device_ids.cpp
@@ -0,0 +1,220 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "utils.h"
+
+int get_device_ids(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements,
+                   cl_dx9_media_adapter_type_khr adapterType)
+{
+    // Cross-checks the device lists reported for one DX9 media adapter type:
+    // preferred devices within all adapter devices within platform devices.
+    CResult result;
+
+    std::auto_ptr<CDeviceWrapper> deviceWrapper;
+    if (!DeviceCreate(adapterType, deviceWrapper))
+    {
+        result.ResultSub(CResult::TEST_ERROR);
+        return result.Result();
+    }
+
+    cl_uint platformCount = 0;
+    cl_int status = clGetDeviceIDs(gPlatformIDdetected, CL_DEVICE_TYPE_ALL, 0,
+                                   0, &platformCount);
+    if (status != CL_SUCCESS || platformCount == 0)
+    {
+        log_error("clGetDeviceIDs failed: %s\n", IGetErrorString(status));
+        result.ResultSub(CResult::TEST_FAIL);
+        return result.Result();
+    }
+
+    std::vector<cl_device_id> platformDevices(platformCount);
+    status = clGetDeviceIDs(gPlatformIDdetected, CL_DEVICE_TYPE_ALL,
+                            platformCount, &platformDevices[0], 0);
+    if (status != CL_SUCCESS)
+    {
+        log_error("clGetDeviceIDs failed: %s\n", IGetErrorString(status));
+        result.ResultSub(CResult::TEST_FAIL);
+        return result.Result();
+    }
+
+    while (deviceWrapper->AdapterNext())
+    {
+        // The query takes parallel arrays; this test passes one entry each.
+        std::vector<cl_dx9_media_adapter_type_khr> adapterTypes(1, adapterType);
+        std::vector<void *> adapterDevices(1, deviceWrapper->Device());
+
+        // check if the test can be run on the adapter
+        status = deviceExistForCLTest(gPlatformIDdetected, adapterType,
+                                      deviceWrapper->Device(), result);
+        if (status != CL_SUCCESS)
+        {
+            return result.Result();
+        }
+
+        // Size query first; CL_DEVICE_NOT_FOUND is tolerated at this step.
+        cl_uint allCount = 0;
+        status = clGetDeviceIDsFromDX9MediaAdapterKHR(
+            gPlatformIDdetected, 1, &adapterTypes[0], &adapterDevices[0],
+            CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, 0, 0, &allCount);
+        if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
+        {
+            log_error("clGetDeviceIDsFromDX9MediaAdapterKHR failed: %s\n",
+                      IGetErrorString(status));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        std::vector<cl_device_id> allDevices(allCount);
+        if (allCount > 0)
+        {
+            status = clGetDeviceIDsFromDX9MediaAdapterKHR(
+                gPlatformIDdetected, 1, &adapterTypes[0], &adapterDevices[0],
+                CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, allCount,
+                &allDevices[0], 0);
+            if (status != CL_SUCCESS)
+            {
+                log_error("clGetDeviceIDsFromDX9MediaAdapterKHR failed: %s\n",
+                          IGetErrorString(status));
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+        }
+
+        cl_uint prefCount = 0;
+        status = clGetDeviceIDsFromDX9MediaAdapterKHR(
+            gPlatformIDdetected, 1, &adapterTypes[0], &adapterDevices[0],
+            CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, 0, 0, &prefCount);
+        if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
+        {
+            log_error("clGetDeviceIDsFromDX9MediaAdapterKHR failed: %s\n",
+                      IGetErrorString(status));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        std::vector<cl_device_id> prefDevices(prefCount);
+        if (prefCount > 0)
+        {
+            status = clGetDeviceIDsFromDX9MediaAdapterKHR(
+                gPlatformIDdetected, 1, &adapterTypes[0], &adapterDevices[0],
+                CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, prefCount,
+                &prefDevices[0], 0);
+            if (status != CL_SUCCESS)
+            {
+                log_error("clGetDeviceIDsFromDX9MediaAdapterKHR failed: %s\n",
+                          IGetErrorString(status));
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+        }
+
+        if (prefCount > allCount)
+        {
+            log_error("Invalid number of preferred devices. It should be a "
+                      "subset of all devices\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        // Every preferred device must also appear in the "all" list.
+        for (cl_uint p = 0; p < prefCount; ++p)
+        {
+            bool found = false;
+            for (cl_uint a = 0; a < allCount && !found; ++a)
+            {
+                found = (prefDevices[p] == allDevices[a]);
+            }
+
+            if (!found)
+            {
+                log_error("Preferred device is not a subset of all devices\n");
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+        }
+
+        // Every adapter device must be one of the platform's devices.
+        for (cl_uint a = 0; a < allCount; ++a)
+        {
+            bool found = false;
+            for (cl_uint d = 0; d < platformCount && !found; ++d)
+            {
+                found = (allDevices[a] == platformDevices[d]);
+            }
+
+            if (!found)
+            {
+                log_error("CL_ALL_DEVICES_FOR_MEDIA_ADAPTER_KHR should be a "
+                          "subset of all devices for selected platform\n");
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+        }
+    }
+
+    if (deviceWrapper->Status() != DEVICE_PASS)
+    {
+        std::string adapterLabel;
+        AdapterToString(adapterType, adapterLabel);
+        if (deviceWrapper->Status() == DEVICE_FAIL)
+        {
+            log_error("%s init failed\n", adapterLabel.c_str());
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+        else
+        {
+            log_error("%s init incomplete due to unsupported device\n",
+                      adapterLabel.c_str());
+            result.ResultSub(CResult::TEST_NOTSUPPORTED);
+        }
+    }
+
+    return result.Result();
+}
+
+int test_get_device_ids(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements)
+{
+    // Runs get_device_ids() once per adapter type and merges the outcomes.
+    CResult result;
+#if defined(_WIN32)
+    int status = get_device_ids(deviceID, context, queue, num_elements,
+                                CL_ADAPTER_D3D9_KHR);
+    if (status != 0)
+    {
+        log_error("\nTest case (D3D9) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    status = get_device_ids(deviceID, context, queue, num_elements,
+                            CL_ADAPTER_D3D9EX_KHR);
+    if (status != 0)
+    {
+        log_error("\nTest case (D3D9EX) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    status = get_device_ids(deviceID, context, queue, num_elements,
+                            CL_ADAPTER_DXVA_KHR);
+    if (status != 0)
+    {
+        log_error("\nTest case (DXVA) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+#else
+    return TEST_NOT_IMPLEMENTED;
+#endif
+
+    return result.Result();
+}
diff --git a/test_conformance/extensions/cl_khr_dx9_media_sharing/test_interop_sync.cpp b/test_conformance/extensions/cl_khr_dx9_media_sharing/test_interop_sync.cpp
new file mode 100644
index 0000000000..fbc616e2bf
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_dx9_media_sharing/test_interop_sync.cpp
@@ -0,0 +1,419 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "utils.h"
+
+// Exercises DX9 media-surface sharing with CL_CONTEXT_INTEROP_USER_SYNC set to
+// userSync. For each adapter it creates a media surface and a context (via
+// functionCreate), fills the surface with generated YUV data, wraps every
+// plane as a CL image, acquires the planes, reads them back and compares them
+// with the input, releases them, and finally checks that the surface still
+// holds the input data. Returns the accumulated CResult code; an unsupported
+// surface format ends the run early without recording a failure.
+int interop_user_sync(cl_device_id deviceID, cl_context context,
+                      cl_command_queue queue, int num_elements,
+                      unsigned int width, unsigned int height,
+                      TContextFuncType functionCreate,
+                      cl_dx9_media_adapter_type_khr adapterType,
+                      TSurfaceFormat surfaceFormat,
+                      TSharedHandleType sharedHandle, cl_bool userSync)
+{
+    CResult result;
+
+    // create device
+    std::auto_ptr<CDeviceWrapper> deviceWrapper;
+    if (!DeviceCreate(adapterType, deviceWrapper))
+    {
+        result.ResultSub(CResult::TEST_ERROR);
+        return result.Result();
+    }
+
+    // generate input data
+    std::vector<cl_uchar> bufferIn(width * height * 3 / 2, 0);
+    if (!YUVGenerate(surfaceFormat, bufferIn, width, height, 0, 255))
+    {
+        result.ResultSub(CResult::TEST_ERROR);
+        return result.Result();
+    }
+
+    while (deviceWrapper->AdapterNext())
+    {
+        cl_int error;
+        // check if the test can be run on the adapter
+        if (CL_SUCCESS
+            != (error = deviceExistForCLTest(gPlatformIDdetected, adapterType,
+                                             deviceWrapper->Device(), result,
+                                             sharedHandle)))
+        {
+            return result.Result();
+        }
+
+        if (surfaceFormat != SURFACE_FORMAT_NV12
+            && !SurfaceFormatCheck(adapterType, *deviceWrapper, surfaceFormat))
+        {
+            std::string sharedHandleStr =
+                (sharedHandle == SHARED_HANDLE_ENABLED) ? "yes" : "no";
+            std::string syncStr = (userSync == CL_TRUE) ? "yes" : "no";
+            std::string formatStr;
+            std::string adapterStr;
+            SurfaceFormatToString(surfaceFormat, formatStr);
+            AdapterToString(adapterType, adapterStr);
+            log_info("Skipping test case, image format is not supported by a "
+                     "device (adapter type: %s, format: %s, shared handle: %s, "
+                     "user sync: %s)\n",
+                     adapterStr.c_str(), formatStr.c_str(),
+                     sharedHandleStr.c_str(), syncStr.c_str());
+            return result.Result();
+        }
+
+        void *objectSharedHandle = 0;
+        std::auto_ptr<CSurfaceWrapper> surface;
+        if (!MediaSurfaceCreate(
+                adapterType, width, height, surfaceFormat, *deviceWrapper,
+                surface, (sharedHandle == SHARED_HANDLE_ENABLED) ? true : false,
+                &objectSharedHandle))
+        {
+            log_error("Media surface creation failed for %i adapter\n",
+                      deviceWrapper->AdapterIdx());
+            result.ResultSub(CResult::TEST_ERROR);
+            return result.Result();
+        }
+
+        cl_context_properties contextProperties[] = {
+            CL_CONTEXT_PLATFORM,
+            (cl_context_properties)gPlatformIDdetected,
+            AdapterTypeToContextInfo(adapterType),
+            (cl_context_properties)deviceWrapper->Device(),
+            CL_CONTEXT_INTEROP_USER_SYNC,
+            userSync,
+            0,
+        };
+
+
+        clContextWrapper ctx;
+        switch (functionCreate)
+        {
+            case CONTEXT_CREATE_DEFAULT:
+                ctx = clCreateContext(&contextProperties[0], 1,
+                                      &gDeviceIDdetected, NULL, NULL, &error);
+                break;
+            case CONTEXT_CREATE_FROM_TYPE:
+                ctx = clCreateContextFromType(&contextProperties[0],
+                                              gDeviceTypeSelected, NULL, NULL,
+                                              &error);
+                break;
+            default:
+                log_error("Unknown context creation function enum\n");
+                result.ResultSub(CResult::TEST_ERROR);
+                return result.Result();
+                break;
+        }
+
+        if (error != CL_SUCCESS)
+        {
+            std::string functionName;
+            FunctionContextCreateToString(functionCreate, functionName);
+            log_error("%s failed: %s\n", functionName.c_str(),
+                      IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        if (!YUVSurfaceSet(surfaceFormat, surface, bufferIn, width, height))
+        {
+            result.ResultSub(CResult::TEST_ERROR);
+            return result.Result();
+        }
+
+#if defined(_WIN32)
+        cl_dx9_surface_info_khr surfaceInfo;
+        surfaceInfo.resource =
+            *(static_cast<CD3D9SurfaceWrapper *>(surface.get()));
+        surfaceInfo.shared_handle = objectSharedHandle;
+#else
+        void *surfaceInfo = 0;
+        return TEST_NOT_IMPLEMENTED;
+#endif
+
+        std::vector<cl_mem> memObjList;
+        unsigned int planesNum = PlanesNum(surfaceFormat);
+        std::vector<clMemWrapper> planesList(planesNum);
+        for (unsigned int planeIdx = 0; planeIdx < planesNum; ++planeIdx)
+        {
+            planesList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(
+                ctx, CL_MEM_READ_WRITE, adapterType, &surfaceInfo, planeIdx,
+                &error);
+            if (error != CL_SUCCESS)
+            {
+                log_error(
+                    "clCreateFromDX9MediaSurfaceKHR failed for plane %i: %s\n",
+                    planeIdx, IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+            memObjList.push_back(planesList[planeIdx]);
+        }
+
+        clCommandQueueWrapper cmdQueue = clCreateCommandQueueWithProperties(
+            ctx, gDeviceIDdetected, 0, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("Unable to create command queue: %s\n",
+                      IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        if (!ImageInfoVerify(adapterType, memObjList, width, height, surface,
+                             objectSharedHandle))
+        {
+            log_error("Image info verification failed\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        if (userSync == CL_TRUE)
+        {
+#if defined(_WIN32)
+            // With user sync the application itself must ensure pending D3D
+            // work has finished before OpenCL acquires the surfaces, so issue
+            // an event query and spin until the device reports it complete.
+            IDirect3DDevice9 *d3dDevice = NULL;
+            switch (adapterType)
+            {
+                case CL_ADAPTER_D3D9_KHR:
+                    d3dDevice = (LPDIRECT3DDEVICE9)deviceWrapper->Device();
+                    break;
+                case CL_ADAPTER_D3D9EX_KHR:
+                    d3dDevice = (LPDIRECT3DDEVICE9EX)deviceWrapper->Device();
+                    break;
+                case CL_ADAPTER_DXVA_KHR: {
+                    CDXVAWrapper *DXVADevice =
+                        dynamic_cast<CDXVAWrapper *>(&(*deviceWrapper));
+                    if (DXVADevice)
+                        d3dDevice =
+                            (LPDIRECT3DDEVICE9EX)(DXVADevice->D3D9()).Device();
+                }
+                break;
+                default:
+                    log_error("Unknown adapter type\n");
+                    result.ResultSub(CResult::TEST_ERROR);
+                    return result.Result();
+            }
+
+            IDirect3DQuery9 *eventQuery = NULL;
+            if (!d3dDevice
+                || FAILED(d3dDevice->CreateQuery(D3DQUERYTYPE_EVENT,
+                                                 &eventQuery))
+                || !eventQuery)
+            {
+                log_error("Unable to create D3D event query\n");
+                result.ResultSub(CResult::TEST_ERROR);
+                return result.Result();
+            }
+            eventQuery->Issue(D3DISSUE_END);
+            while (S_FALSE == eventQuery->GetData(NULL, 0, D3DGETDATA_FLUSH))
+                ;
+            eventQuery->Release();
+#else
+            return TEST_NOT_IMPLEMENTED;
+#endif
+        }
+
+        error = clEnqueueAcquireDX9MediaSurfacesKHR(
+            cmdQueue, static_cast<cl_uint>(memObjList.size()),
+            &memObjList.at(0), 0, 0, 0);
+        if (error != CL_SUCCESS)
+        {
+            log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n",
+                      IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        size_t origin[3] = { 0, 0, 0 };
+        size_t offset = 0;
+        size_t frameSize = width * height * 3 / 2;
+        std::vector<cl_uchar> out(frameSize, 0);
+        for (size_t i = 0; i < memObjList.size(); ++i)
+        {
+            size_t planeWidth = (i == 0) ? width : width / 2;
+            size_t planeHeight = (i == 0) ? height : height / 2;
+            size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+
+            error =
+                clEnqueueReadImage(cmdQueue, memObjList.at(i), CL_TRUE, origin,
+                                   regionPlane, 0, 0, &out.at(offset), 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueReadImage failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+
+            offset += planeWidth * planeHeight;
+        }
+
+        if (!YUVCompare(surfaceFormat, out, bufferIn, width, height))
+        {
+            log_error("OCL object verification failed - clEnqueueReadImage\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        error = clEnqueueReleaseDX9MediaSurfacesKHR(
+            cmdQueue, static_cast<cl_uint>(memObjList.size()),
+            &memObjList.at(0), 0, 0, 0);
+        if (error != CL_SUCCESS)
+        {
+            log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n",
+                      IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        if (userSync == CL_TRUE)
+        {
+            error = clFinish(cmdQueue);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clFinish failed: %s\n", IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+        }
+
+        // shared object verification
+        std::vector<cl_uchar> bufferOut(frameSize, 0);
+        if (!YUVSurfaceGet(surfaceFormat, surface, bufferOut, width, height))
+        {
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        if (!YUVCompare(surfaceFormat, bufferOut, bufferIn, width, height))
+        {
+            log_error("Media surface is different than expected\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+    }
+
+    if (deviceWrapper->Status() != DEVICE_PASS)
+    {
+        std::string adapterName;
+        AdapterToString(adapterType, adapterName);
+
+        if (deviceWrapper->Status() == DEVICE_FAIL)
+        {
+            log_error("%s init failed\n", adapterName.c_str());
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+        else
+        {
+            log_error("%s init incomplete due to unsupported device\n",
+                      adapterName.c_str());
+            result.ResultSub(CResult::TEST_NOTSUPPORTED);
+        }
+    }
+
+    return result.Result();
+}
+
+// Runs interop_user_sync() for every combination of adapter type, context
+// creation function, YUV format, shared-handle mode and user-sync setting,
+// logging each failing combination. Returns TEST_NOT_IMPLEMENTED off Windows.
+int test_interop_user_sync(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements)
+{
+    const unsigned int WIDTH = 256;
+    const unsigned int HEIGHT = 256;
+
+    std::vector<cl_dx9_media_adapter_type_khr> adapters;
+#if defined(_WIN32)
+    adapters.push_back(CL_ADAPTER_D3D9_KHR);
+    adapters.push_back(CL_ADAPTER_D3D9EX_KHR);
+    adapters.push_back(CL_ADAPTER_DXVA_KHR);
+#else
+    return TEST_NOT_IMPLEMENTED;
+#endif
+
+    std::vector<TContextFuncType> contextFuncs;
+    contextFuncs.push_back(CONTEXT_CREATE_DEFAULT);
+    contextFuncs.push_back(CONTEXT_CREATE_FROM_TYPE);
+
+    std::vector<TSurfaceFormat> formats;
+    formats.push_back(SURFACE_FORMAT_NV12);
+    formats.push_back(SURFACE_FORMAT_YV12);
+
+    std::vector<TSharedHandleType> sharedHandleTypes;
+    sharedHandleTypes.push_back(SHARED_HANDLE_DISABLED);
+    sharedHandleTypes.push_back(SHARED_HANDLE_ENABLED);
+
+    std::vector<cl_bool> sync;
+    sync.push_back(CL_FALSE);
+    sync.push_back(CL_TRUE);
+
+    CResult result;
+    for (size_t adapterIdx = 0; adapterIdx < adapters.size(); ++adapterIdx)
+    {
+        // iteration through all create context functions
+        for (size_t funcIdx = 0; funcIdx < contextFuncs.size(); ++funcIdx)
+        {
+            // iteration through YUV formats
+            for (size_t formatIdx = 0; formatIdx < formats.size(); ++formatIdx)
+            {
+                // shared handle enabled or disabled
+                for (size_t handleIdx = 0;
+                     handleIdx < sharedHandleTypes.size(); ++handleIdx)
+                {
+                    // user sync interop disabled or enabled
+                    for (size_t syncIdx = 0; syncIdx < sync.size(); ++syncIdx)
+                    {
+                        if (adapters[adapterIdx] == CL_ADAPTER_D3D9_KHR
+                            && sharedHandleTypes[handleIdx]
+                                == SHARED_HANDLE_ENABLED)
+                            continue;
+
+                        if (interop_user_sync(
+                                deviceID, context, queue, num_elements, WIDTH,
+                                HEIGHT, contextFuncs[funcIdx],
+                                adapters[adapterIdx], formats[formatIdx],
+                                sharedHandleTypes[handleIdx], sync[syncIdx])
+                            == 0)
+                            continue;
+
+                        std::string syncStr = (sync[syncIdx] == CL_TRUE)
+                            ? "user sync enabled"
+                            : "user sync disabled";
+                        std::string sharedHandle =
+                            (sharedHandleTypes[handleIdx]
+                             == SHARED_HANDLE_ENABLED)
+                            ? "shared handle"
+                            : "no shared handle";
+                        std::string functionStr, adapterStr, formatStr;
+                        FunctionContextCreateToString(contextFuncs[funcIdx],
+                                                      functionStr);
+                        AdapterToString(adapters[adapterIdx], adapterStr);
+                        SurfaceFormatToString(formats[formatIdx], formatStr);
+                        log_error(
+                            "\nTest case - %s (%s, %s, %s, %s) failed\n\n",
+                            functionStr.c_str(), adapterStr.c_str(),
+                            formatStr.c_str(), sharedHandle.c_str(),
+                            syncStr.c_str());
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+                }
+            }
+        }
+    }
+
+    return result.Result();
+}
diff --git a/test_conformance/extensions/cl_khr_dx9_media_sharing/test_memory_access.cpp b/test_conformance/extensions/cl_khr_dx9_media_sharing/test_memory_access.cpp
new file mode 100644
index 0000000000..1e4e2c4ebc
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_dx9_media_sharing/test_memory_access.cpp
@@ -0,0 +1,549 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "utils.h"
+
+int memory_access(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements, unsigned int width,
+                  unsigned int height,
+                  cl_dx9_media_adapter_type_khr adapterType,
+                  TSurfaceFormat surfaceFormat, TSharedHandleType sharedHandle)
+{
+    CResult result;
+
+    std::auto_ptr<CDeviceWrapper> deviceWrapper;
+    // creates device
+    if (!DeviceCreate(adapterType, deviceWrapper))
+    {
+        result.ResultSub(CResult::TEST_ERROR);
+        return result.Result();
+    }
+
+    // generate input and expected data
+    size_t frameSize = width * height * 3 / 2;
+    std::vector<cl_uchar> bufferRef0(frameSize, 0);
+    std::vector<cl_uchar> bufferRef1(frameSize, 0);
+    std::vector<cl_uchar> bufferRef2(frameSize, 0);
+    if (!YUVGenerate(surfaceFormat, bufferRef0, width, height, 0, 90)
+        || !YUVGenerate(surfaceFormat, bufferRef1, width, height, 91, 180)
+        || !YUVGenerate(surfaceFormat, bufferRef2, width, height, 181, 255))
+    {
+        result.ResultSub(CResult::TEST_ERROR);
+        return result.Result();
+    }
+
+    // iterates through all devices
+    while (deviceWrapper->AdapterNext())
+    {
+        cl_int error;
+        // check if the test can be run on the adapter
+        if (CL_SUCCESS
+            != (error = deviceExistForCLTest(gPlatformIDdetected, adapterType,
+                                             deviceWrapper->Device(), result,
+                                             sharedHandle)))
+        {
+            return result.Result();
+        }
+
+        if (surfaceFormat != SURFACE_FORMAT_NV12
+            && !SurfaceFormatCheck(adapterType, *deviceWrapper, surfaceFormat))
+        {
+            std::string sharedHandleStr =
+                (sharedHandle == SHARED_HANDLE_ENABLED) ? "yes" : "no";
+            std::string formatStr;
+            std::string adapterStr;
+            SurfaceFormatToString(surfaceFormat, formatStr);
+            AdapterToString(adapterType, adapterStr);
+            log_info(
+                "Skipping test case, image format is not supported by a device "
+                "(adapter type: %s, format: %s, shared handle: %s)\n",
+                adapterStr.c_str(), formatStr.c_str(), sharedHandleStr.c_str());
+            return result.Result();
+        }
+
+        void *objectSharedHandle = 0;
+        std::auto_ptr<CSurfaceWrapper> surface;
+
+        // creates surface
+        if (!MediaSurfaceCreate(
+                adapterType, width, height, surfaceFormat, *deviceWrapper,
+                surface, (sharedHandle == SHARED_HANDLE_ENABLED) ? true : false,
+                &objectSharedHandle))
+        {
+            log_error("Media surface creation failed for %i adapter\n",
+                      deviceWrapper->AdapterIdx());
+            result.ResultSub(CResult::TEST_ERROR);
+            return result.Result();
+        }
+
+        if (!YUVSurfaceSet(surfaceFormat, surface, bufferRef0, width, height))
+        {
+            result.ResultSub(CResult::TEST_ERROR);
+            return result.Result();
+        }
+
+        cl_context_properties contextProperties[] = {
+            CL_CONTEXT_PLATFORM,
+            (cl_context_properties)gPlatformIDdetected,
+            AdapterTypeToContextInfo(adapterType),
+            (cl_context_properties)deviceWrapper->Device(),
+            0,
+        };
+
+        clContextWrapper ctx = clCreateContext(
+            &contextProperties[0], 1, &gDeviceIDdetected, NULL, NULL, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("clCreateContext failed: %s\n", IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        clCommandQueueWrapper cmdQueue = clCreateCommandQueueWithProperties(
+            ctx, gDeviceIDdetected, 0, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("Unable to create command queue: %s\n",
+                      IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        { // memory access write
+#if defined(_WIN32)
+            cl_dx9_surface_info_khr surfaceInfo;
+            surfaceInfo.resource =
+                *(static_cast<CD3D9SurfaceWrapper *>(surface.get()));
+            surfaceInfo.shared_handle = objectSharedHandle;
+#else
+            void *surfaceInfo = 0;
+            return TEST_NOT_IMPLEMENTED;
+#endif
+
+            std::vector<cl_mem> memObjList;
+            unsigned int planesNum = PlanesNum(surfaceFormat);
+            std::vector<clMemWrapper> planesList(planesNum);
+            for (unsigned int planeIdx = 0; planeIdx < planesNum; ++planeIdx)
+            {
+                planesList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(
+                    ctx, CL_MEM_WRITE_ONLY, adapterType, &surfaceInfo, planeIdx,
+                    &error);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("clCreateFromDX9MediaSurfaceKHR failed for "
+                              "WRITE_ONLY plane %i: %s\n",
+                              planeIdx, IGetErrorString(error));
+                    result.ResultSub(CResult::TEST_FAIL);
+                    return result.Result();
+                }
+                memObjList.push_back(planesList[planeIdx]);
+            }
+
+            error = clEnqueueAcquireDX9MediaSurfacesKHR(
+                cmdQueue, static_cast<cl_uint>(memObjList.size()),
+                &memObjList[0], 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+
+            size_t offset = 0;
+            size_t origin[3] = { 0, 0, 0 };
+            for (size_t i = 0; i < memObjList.size(); ++i)
+            {
+                size_t planeWidth = (i == 0) ? width : width / 2;
+                size_t planeHeight = (i == 0) ? height : height / 2;
+                size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+
+                error = clEnqueueWriteImage(cmdQueue, memObjList[i], CL_TRUE,
+                                            origin, regionPlane, 0, 0,
+                                            &bufferRef1[offset], 0, 0, 0);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("clEnqueueWriteImage failed: %s\n",
+                              IGetErrorString(error));
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                offset += planeWidth * planeHeight;
+            }
+
+            error = clEnqueueReleaseDX9MediaSurfacesKHR(
+                cmdQueue, static_cast<cl_uint>(memObjList.size()),
+                &memObjList[0], 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+        }
+
+        std::vector<cl_uchar> bufferOut0(frameSize, 0);
+        if (!YUVSurfaceGet(surfaceFormat, surface, bufferOut0, width, height))
+        {
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        if (!YUVCompare(surfaceFormat, bufferOut0, bufferRef1, width, height))
+        {
+            log_error("Media surface is different than expected\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        { // memory access read
+#if defined(_WIN32)
+            cl_dx9_surface_info_khr surfaceInfo;
+            surfaceInfo.resource =
+                *(static_cast<CD3D9SurfaceWrapper *>(surface.get()));
+            surfaceInfo.shared_handle = objectSharedHandle;
+#else
+            void *surfaceInfo = 0;
+            return TEST_NOT_IMPLEMENTED;
+#endif
+
+            std::vector<cl_mem> memObjList;
+            unsigned int planesNum = PlanesNum(surfaceFormat);
+            std::vector<clMemWrapper> planesList(planesNum);
+            for (unsigned int planeIdx = 0; planeIdx < planesNum; ++planeIdx)
+            {
+                planesList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(
+                    ctx, CL_MEM_READ_ONLY, adapterType, &surfaceInfo, planeIdx,
+                    &error);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("clCreateFromDX9MediaSurfaceKHR failed for "
+                              "READ_ONLY plane %i: %s\n",
+                              planeIdx, IGetErrorString(error));
+                    result.ResultSub(CResult::TEST_FAIL);
+                    return result.Result();
+                }
+                memObjList.push_back(planesList[planeIdx]);
+            }
+
+            error = clEnqueueAcquireDX9MediaSurfacesKHR(
+                cmdQueue, static_cast<cl_uint>(memObjList.size()),
+                &memObjList[0], 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+
+            std::vector<cl_uchar> out(frameSize, 0);
+            size_t offset = 0;
+            size_t origin[3] = { 0, 0, 0 };
+
+            for (size_t i = 0; i < memObjList.size(); ++i)
+            {
+                size_t planeWidth = (i == 0) ? width : width / 2;
+                size_t planeHeight = (i == 0) ? height : height / 2;
+                size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+
+                error = clEnqueueReadImage(cmdQueue, memObjList[i], CL_TRUE,
+                                           origin, regionPlane, 0, 0,
+                                           &out[offset], 0, 0, 0);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("clEnqueueReadImage failed: %s\n",
+                              IGetErrorString(error));
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                offset += planeWidth * planeHeight;
+            }
+
+            if (!YUVCompare(surfaceFormat, out, bufferRef1, width, height))
+            {
+                log_error("OCL image (READ_ONLY) is different then expected\n");
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+
+            error = clEnqueueReleaseDX9MediaSurfacesKHR(
+                cmdQueue, static_cast<cl_uint>(memObjList.size()),
+                &memObjList[0], 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+        }
+
+        std::vector<cl_uchar> bufferOut1(frameSize, 0);
+        if (!YUVSurfaceGet(surfaceFormat, surface, bufferOut1, width, height))
+        {
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        if (!YUVCompare(surfaceFormat, bufferOut1, bufferRef1, width, height))
+        {
+            log_error("Media surface is different than expected\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        { // memory access read write
+#if defined(_WIN32)
+            cl_dx9_surface_info_khr surfaceInfo;
+            surfaceInfo.resource =
+                *(static_cast<CD3D9SurfaceWrapper *>(surface.get()));
+            surfaceInfo.shared_handle = objectSharedHandle;
+#else
+            void *surfaceInfo = 0;
+            return TEST_NOT_IMPLEMENTED;
+#endif
+
+            std::vector<cl_mem> memObjList;
+            unsigned int planesNum = PlanesNum(surfaceFormat);
+            std::vector<clMemWrapper> planesList(planesNum);
+            for (unsigned int planeIdx = 0; planeIdx < planesNum; ++planeIdx)
+            {
+                planesList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(
+                    ctx, CL_MEM_READ_WRITE, adapterType, &surfaceInfo, planeIdx,
+                    &error);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("clCreateFromDX9MediaSurfaceKHR failed for "
+                              "READ_WRITE plane %i: %s\n",
+                              planeIdx, IGetErrorString(error));
+                    result.ResultSub(CResult::TEST_FAIL);
+                    return result.Result();
+                }
+                memObjList.push_back(planesList[planeIdx]);
+            }
+
+            error = clEnqueueAcquireDX9MediaSurfacesKHR(
+                cmdQueue, static_cast<cl_uint>(memObjList.size()),
+                &memObjList[0], 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+
+            { // read
+                std::vector<cl_uchar> out(frameSize, 0);
+                size_t offset = 0;
+                size_t origin[3] = { 0, 0, 0 };
+
+                for (size_t i = 0; i < memObjList.size(); ++i)
+                {
+                    size_t planeWidth = (i == 0) ? width : width / 2;
+                    size_t planeHeight = (i == 0) ? height : height / 2;
+                    size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+
+                    error = clEnqueueReadImage(cmdQueue, memObjList[i], CL_TRUE,
+                                               origin, regionPlane, 0, 0,
+                                               &out[offset], 0, 0, 0);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clEnqueueReadImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    offset += planeWidth * planeHeight;
+                }
+
+                if (!YUVCompare(surfaceFormat, out, bufferRef1, width, height))
+                {
+                    log_error(
+                        "OCL image (READ_WRITE) is different then expected\n");
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+            }
+
+            { // write
+                size_t offset = 0;
+                size_t origin[3] = { 0, 0, 0 };
+                for (size_t i = 0; i < memObjList.size(); ++i)
+                {
+                    size_t planeWidth = (i == 0) ? width : width / 2;
+                    size_t planeHeight = (i == 0) ? height : height / 2;
+                    size_t regionPlane[3] = { planeWidth, planeHeight, 1 };
+
+                    error = clEnqueueWriteImage(
+                        cmdQueue, memObjList[i], CL_TRUE, origin, regionPlane,
+                        0, 0, &bufferRef2[offset], 0, 0, 0);
+                    if (error != CL_SUCCESS)
+                    {
+                        log_error("clEnqueueWriteImage failed: %s\n",
+                                  IGetErrorString(error));
+                        result.ResultSub(CResult::TEST_FAIL);
+                    }
+
+                    offset += planeWidth * planeHeight;
+                }
+            }
+
+            error = clEnqueueReleaseDX9MediaSurfacesKHR(
+                cmdQueue, static_cast<cl_uint>(memObjList.size()),
+                &memObjList[0], 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+        }
+
+        std::vector<cl_uchar> bufferOut2(frameSize, 0);
+        if (!YUVSurfaceGet(surfaceFormat, surface, bufferOut2, width, height))
+        {
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        if (!YUVCompare(surfaceFormat, bufferOut2, bufferRef2, width, height))
+        {
+            log_error("Media surface is different than expected\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+    }
+
+    if (deviceWrapper->Status() != DEVICE_PASS)
+    {
+        std::string adapterName;
+        AdapterToString(adapterType, adapterName);
+        if (deviceWrapper->Status() == DEVICE_FAIL)
+        {
+            log_error("%s init failed\n", adapterName.c_str());
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+        else
+        {
+            log_error("%s init incomplete due to unsupported device\n",
+                      adapterName.c_str());
+            result.ResultSub(CResult::TEST_NOTSUPPORTED);
+        }
+    }
+
+    return result.Result();
+}
+
+// Runs memory_access() once for every entry of a fixed table of adapter
+// type / surface format / shared-handle combinations. A case that returns a
+// non-zero status is logged and recorded as CResult::TEST_FAIL; the remaining
+// cases still run, and the accumulated result.Result() is returned.
+//
+// deviceID, context, queue and num_elements are forwarded unchanged to each
+// memory_access() call. When _WIN32 is not defined no case is executed and
+// TEST_NOT_IMPLEMENTED is returned.
+int test_memory_access(cl_device_id deviceID, cl_context context,
+                       cl_command_queue queue, int num_elements)
+{
+#if defined(_WIN32)
+    // Arguments of one memory_access() call plus the label that is printed
+    // when that call fails.
+    struct TestCase
+    {
+        // NOTE(review): presumably width then height -- confirm against the
+        // memory_access() signature.
+        unsigned int width;
+        unsigned int height;
+        cl_dx9_media_adapter_type_khr adapterType;
+        TSurfaceFormat surfaceFormat;
+        TSharedHandleType sharedHandle;
+        const char *label;
+    };
+
+    // The cases run in table order, so failures are logged in this order.
+    static const TestCase testCases[] = {
+        // D3D9
+        {
+            256, 256, CL_ADAPTER_D3D9_KHR, SURFACE_FORMAT_NV12,
+            SHARED_HANDLE_DISABLED, "D3D9, NV12, no shared handle",
+        },
+
+        {
+            512, 256, CL_ADAPTER_D3D9_KHR, SURFACE_FORMAT_YV12,
+            SHARED_HANDLE_DISABLED, "D3D9, YV12, no shared handle",
+        },
+
+        // D3D9EX
+        {
+            256, 512, CL_ADAPTER_D3D9EX_KHR, SURFACE_FORMAT_NV12,
+            SHARED_HANDLE_DISABLED, "D3D9EX, NV12, no shared handle",
+        },
+
+        {
+            512, 256, CL_ADAPTER_D3D9EX_KHR, SURFACE_FORMAT_NV12,
+            SHARED_HANDLE_ENABLED, "D3D9EX, NV12, shared handle",
+        },
+
+        {
+            256, 256, CL_ADAPTER_D3D9EX_KHR, SURFACE_FORMAT_YV12,
+            SHARED_HANDLE_DISABLED, "D3D9EX, YV12, no shared handle",
+        },
+
+        {
+            128, 128, CL_ADAPTER_D3D9EX_KHR, SURFACE_FORMAT_YV12,
+            SHARED_HANDLE_ENABLED, "D3D9EX, YV12, shared handle",
+        },
+
+        // DXVA
+        {
+            128, 128, CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_NV12,
+            SHARED_HANDLE_DISABLED, "DXVA, NV12, no shared handle",
+        },
+
+        {
+            64, 64, CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_NV12,
+            SHARED_HANDLE_ENABLED, "DXVA, NV12, shared handle",
+        },
+
+        {
+            512, 512, CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_YV12,
+            SHARED_HANDLE_DISABLED, "DXVA, YV12, no shared handle",
+        },
+
+        {
+            1024, 1024, CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_YV12,
+            SHARED_HANDLE_ENABLED, "DXVA, YV12, shared handle",
+        },
+    };
+
+    CResult result;
+    for (const TestCase &testCase : testCases)
+    {
+        // A failing case must not stop the run: every combination is
+        // exercised and reported individually.
+        if (memory_access(deviceID, context, queue, num_elements,
+                          testCase.width, testCase.height,
+                          testCase.adapterType, testCase.surfaceFormat,
+                          testCase.sharedHandle)
+            != 0)
+        {
+            log_error("\nTest case (%s) failed\n\n", testCase.label);
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+    }
+
+    return result.Result();
+#else
+    // None of the cases above is compiled without _WIN32, so there is
+    // nothing to run.
+    return TEST_NOT_IMPLEMENTED;
+#endif
+}
diff --git a/test_conformance/extensions/cl_khr_dx9_media_sharing/test_other_data_types.cpp b/test_conformance/extensions/cl_khr_dx9_media_sharing/test_other_data_types.cpp
new file mode 100644
index 0000000000..0e5d1d12ac
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_dx9_media_sharing/test_other_data_types.cpp
@@ -0,0 +1,1319 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "harness/errorHelpers.h"
+#include "harness/imageHelpers.h"
+#include "harness/kernelHelpers.h"
+
+#include "utils.h"
+
+// Round-trips frames through a pair of shared DX9 media surfaces (source and
+// destination) on every adapter the device wrapper enumerates. Per frame the
+// source surface is filled on the host, read back and overwritten through
+// OpenCL, run through TestFunction into the destination image, and the
+// destination is verified and rewritten through a mapping before it is
+// compared once more on the host after the surfaces have been released.
+//
+// T is the host type of one channel (a pixel is sizeof(T) * planeNum bytes);
+// iterationNum is the number of frames processed per adapter. Returns the
+// status accumulated in the local CResult.
+template <typename T>
+int other_data_types(cl_device_id deviceID, cl_context context,
+                     cl_command_queue queue, int num_elements,
+                     unsigned int iterationNum, unsigned int width,
+                     unsigned int height,
+                     cl_dx9_media_adapter_type_khr adapterType,
+                     TSurfaceFormat surfaceFormat,
+                     TSharedHandleType sharedHandle)
+{
+    const unsigned int FRAME_NUM = 2;
+    const float MAX_VALUE = 0.6f;
+    const std::string PROGRAM_STR =
+        "__kernel void TestFunction( read_only image2d_t imageIn, write_only "
+        "image2d_t imageOut, " NL "                            sampler_t "
+        "sampler, __global int *imageRes)" NL "{" NL
+        "  int w = get_global_id(0);" NL "  int h = get_global_id(1);" NL
+        "  int width = get_image_width(imageIn);" NL
+        "  int height = get_image_height(imageOut);" NL
+        "  float4 color0 = read_imagef(imageIn, sampler, (int2)(w,h)) - "
+        "0.2f;" NL "  float4 color1 = read_imagef(imageIn, sampler, "
+        "(float2)(w,h)) - 0.2f;" NL
+        "  color0 = (color0 == color1) ? color0: (float4)(0.5, 0.5, 0.5, "
+        "0.5);" NL "  write_imagef(imageOut, (int2)(w,h), color0);" NL
+        "  if(w == 0 && h == 0)" NL "  {" NL "    imageRes[0] = width;" NL
+        "    imageRes[1] = height;" NL "  }" NL "}";
+
+    CResult result;
+    // Needed as a plain bool by MediaSurfaceCreate and by the skip messages.
+    const bool useSharedHandle = (sharedHandle == SHARED_HANDLE_ENABLED);
+
+    cl_image_format format;
+    if (!SurfaceFormatToOCL(surfaceFormat, format))
+    {
+        result.ResultSub(CResult::TEST_ERROR);
+        return result.Result();
+    }
+
+    std::auto_ptr<CDeviceWrapper> deviceWrapper;
+    if (!DeviceCreate(adapterType, deviceWrapper))
+    {
+        result.ResultSub(CResult::TEST_ERROR);
+        return result.Result();
+    }
+
+    while (deviceWrapper->AdapterNext())
+    {
+        cl_int error;
+        // check if the test can be run on the adapter
+        if (CL_SUCCESS
+            != (error = deviceExistForCLTest(gPlatformIDdetected, adapterType,
+                                             deviceWrapper->Device(), result,
+                                             sharedHandle)))
+        {
+            return result.Result();
+        }
+
+        cl_context_properties contextProperties[] = {
+            CL_CONTEXT_PLATFORM,
+            (cl_context_properties)gPlatformIDdetected,
+            AdapterTypeToContextInfo(adapterType),
+            (cl_context_properties)deviceWrapper->Device(),
+            0,
+        };
+
+        clContextWrapper ctx = clCreateContext(
+            &contextProperties[0], 1, &gDeviceIDdetected, NULL, NULL, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("clCreateContext failed: %s\n", IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        clCommandQueueWrapper cmdQueue = clCreateCommandQueueWithProperties(
+            ctx, gDeviceIDdetected, 0, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("Unable to create command queue: %s\n",
+                      IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        if (!SurfaceFormatCheck(adapterType, *deviceWrapper, surfaceFormat))
+        {
+            std::string sharedHandleStr = useSharedHandle ? "yes" : "no";
+            std::string formatStr;
+            std::string adapterStr;
+            SurfaceFormatToString(surfaceFormat, formatStr);
+            AdapterToString(adapterType, adapterStr);
+            log_info(
+                "Skipping test case, image format is not supported by a device "
+                "(adapter type: %s, format: %s, shared handle: %s)\n",
+                adapterStr.c_str(), formatStr.c_str(), sharedHandleStr.c_str());
+            return result.Result();
+        }
+
+        if (!ImageFormatCheck(ctx, CL_MEM_OBJECT_IMAGE2D, format))
+        {
+            std::string sharedHandleStr = useSharedHandle ? "yes" : "no";
+            std::string formatStr;
+            std::string adapterStr;
+            SurfaceFormatToString(surfaceFormat, formatStr);
+            AdapterToString(adapterType, adapterStr);
+            log_info("Skipping test case, image format is not supported by OCL "
+                     "(adapter type: %s, format: %s, shared handle: %s)\n",
+                     adapterStr.c_str(), formatStr.c_str(),
+                     sharedHandleStr.c_str());
+            return result.Result();
+        }
+
+        if (format.image_channel_data_type == CL_HALF_FLOAT)
+        {
+            if (DetectFloatToHalfRoundingMode(cmdQueue))
+            {
+                log_error("Unable to detect rounding mode\n");
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+        }
+
+        std::vector<std::vector<T>> bufferIn(FRAME_NUM);
+        std::vector<std::vector<T>> bufferExp(FRAME_NUM);
+        float step = MAX_VALUE / static_cast<float>(FRAME_NUM);
+        unsigned int planeNum = ChannelNum(surfaceFormat);
+        for (size_t i = 0; i < FRAME_NUM; ++i)
+        {
+            DataGenerate(surfaceFormat, format.image_channel_data_type,
+                         bufferIn[i], width, height, planeNum, step * i,
+                         step * (i + 1));
+            DataGenerate(surfaceFormat, format.image_channel_data_type,
+                         bufferExp[i], width, height, planeNum, step * i,
+                         step * (i + 1), 0.2f);
+        }
+
+        void *objectSrcHandle = 0;
+        std::auto_ptr<CSurfaceWrapper> surfaceSrc;
+        if (!MediaSurfaceCreate(adapterType, width, height, surfaceFormat,
+                                *deviceWrapper, surfaceSrc, useSharedHandle,
+                                &objectSrcHandle))
+        {
+            log_error("Media surface creation failed for %i adapter\n",
+                      deviceWrapper->AdapterIdx());
+            result.ResultSub(CResult::TEST_ERROR);
+            return result.Result();
+        }
+
+        void *objectDstHandle = 0;
+        std::auto_ptr<CSurfaceWrapper> surfaceDst;
+        if (!MediaSurfaceCreate(adapterType, width, height, surfaceFormat,
+                                *deviceWrapper, surfaceDst, useSharedHandle,
+                                &objectDstHandle))
+        {
+            log_error("Media surface creation failed for %i adapter\n",
+                      deviceWrapper->AdapterIdx());
+            result.ResultSub(CResult::TEST_ERROR);
+            return result.Result();
+        }
+
+#if defined(_WIN32)
+        cl_dx9_surface_info_khr surfaceSrcInfo;
+        CD3D9SurfaceWrapper *dx9SurfaceSrc =
+            (static_cast<CD3D9SurfaceWrapper *>(surfaceSrc.get()));
+        surfaceSrcInfo.resource = *dx9SurfaceSrc;
+        surfaceSrcInfo.shared_handle = objectSrcHandle;
+
+        cl_dx9_surface_info_khr surfaceDstInfo;
+        CD3D9SurfaceWrapper *dx9SurfaceDst =
+            (static_cast<CD3D9SurfaceWrapper *>(surfaceDst.get()));
+        surfaceDstInfo.resource = *dx9SurfaceDst;
+        surfaceDstInfo.shared_handle = objectDstHandle;
+#else
+        void *surfaceSrcInfo = 0;
+        void *surfaceDstInfo = 0;
+        return TEST_NOT_IMPLEMENTED;
+#endif
+
+        // create OCL shared object
+        clMemWrapper objectSrcShared = clCreateFromDX9MediaSurfaceKHR(
+            ctx, CL_MEM_READ_WRITE, adapterType, &surfaceSrcInfo, 0, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("clCreateFromDX9MediaSurfaceKHR failed: %s\n",
+                      IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        clMemWrapper objectDstShared = clCreateFromDX9MediaSurfaceKHR(
+            ctx, CL_MEM_READ_WRITE, adapterType, &surfaceDstInfo, 0, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("clCreateFromDX9MediaSurfaceKHR failed: %s\n",
+                      IGetErrorString(error));
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        std::vector<cl_mem> memObjList;
+        memObjList.push_back(objectSrcShared);
+        memObjList.push_back(objectDstShared);
+
+        if (!GetMemObjInfo(objectSrcShared, adapterType, surfaceSrc,
+                           objectSrcHandle))
+        {
+            log_error("Invalid memory object info\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        if (!GetImageInfo(objectSrcShared, format, sizeof(T) * planeNum,
+                          width * sizeof(T) * planeNum, 0, width, height, 0, 0))
+        {
+            log_error("clGetImageInfo failed\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        // The sampler, program and kernel are frame-independent, so build them
+        // once per adapter; without a kernel the frame loop cannot run at all.
+        clSamplerWrapper sampler = clCreateSampler(
+            ctx, CL_FALSE, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error);
+        if (error != CL_SUCCESS)
+        {
+            log_error("Unable to create sampler\n");
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+        const char *progPtr = PROGRAM_STR.c_str();
+        if (create_single_kernel_helper(ctx, &program, &kernel, 1, &progPtr,
+                                        "TestFunction"))
+        {
+            result.ResultSub(CResult::TEST_FAIL);
+            return result.Result();
+        }
+
+        // unsigned int matches iterationNum and the %u used in the log calls
+        for (unsigned int frameIdx = 0; frameIdx < iterationNum; ++frameIdx)
+        {
+            // surface set
+#if defined(_WIN32)
+            D3DLOCKED_RECT rect;
+            if (FAILED((*dx9SurfaceSrc)->LockRect(&rect, NULL, 0)))
+            {
+                log_error("Surface lock failed\n");
+                result.ResultSub(CResult::TEST_ERROR);
+                return result.Result();
+            }
+
+            size_t pitch = rect.Pitch / sizeof(T);
+            const size_t lineSize = width * planeNum * sizeof(T);
+            T *ptr = static_cast<T *>(rect.pBits);
+
+            for (size_t y = 0; y < height; ++y)
+                memcpy(ptr + y * pitch,
+                       &bufferIn[frameIdx % FRAME_NUM][y * width * planeNum],
+                       lineSize);
+
+            (*dx9SurfaceSrc)->UnlockRect();
+#else
+            return TEST_NOT_IMPLEMENTED;
+#endif
+
+            error = clEnqueueAcquireDX9MediaSurfacesKHR(
+                cmdQueue, static_cast<cl_uint>(memObjList.size()),
+                &memObjList[0], 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+                return result.Result();
+            }
+
+            size_t origin[3] = { 0, 0, 0 };
+            size_t region[3] = { width, height, 1 };
+
+            { // read operation
+                std::vector<T> out(planeNum * width * height, 0);
+                error =
+                    clEnqueueReadImage(cmdQueue, objectSrcShared, CL_TRUE,
+                                       origin, region, 0, 0, &out[0], 0, 0, 0);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("clEnqueueReadImage failed: %s\n",
+                              IGetErrorString(error));
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                if (!DataCompare(surfaceFormat, format.image_channel_data_type,
+                                 out, bufferIn[frameIdx % FRAME_NUM], width,
+                                 height, planeNum))
+                {
+                    log_error("Frame idx: %u, OCL object is different than "
+                              "expected\n",
+                              frameIdx);
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+            }
+
+            { // write operation
+                error = clEnqueueWriteImage(
+                    cmdQueue, objectSrcShared, CL_TRUE, origin, region, 0, 0,
+                    &bufferExp[frameIdx % FRAME_NUM][0], 0, 0, 0);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("clEnqueueWriteImage failed: %s\n",
+                              IGetErrorString(error));
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+            }
+
+            { // kernel operations
+                size_t threads[2] = { width, height };
+
+                size_t bufferSize = sizeof(cl_int) * 2;
+                clMemWrapper imageRes = clCreateBuffer(
+                    ctx, CL_MEM_READ_WRITE, bufferSize, NULL, &error);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("clCreateBuffer failed: %s\n",
+                              IGetErrorString(error));
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                // Any failing argument fails the frame, so the error codes
+                // of all four calls are folded together and checked once.
+                error = clSetKernelArg(kernel, 0, sizeof(objectSrcShared),
+                                       &(objectSrcShared));
+                error |= clSetKernelArg(kernel, 1, sizeof(objectDstShared),
+                                        &(objectDstShared));
+                error |= clSetKernelArg(kernel, 2, sizeof(sampler), &sampler);
+                error |= clSetKernelArg(kernel, 3, sizeof(imageRes), &imageRes);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("Unable to set kernel arguments");
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                size_t localThreads[2];
+                error = get_max_common_2D_work_group_size(ctx, kernel, threads,
+                                                          localThreads);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("Unable to get work group size to use");
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                error =
+                    clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, threads,
+                                           localThreads, 0, NULL, NULL);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("Unable to execute test kernel");
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                std::vector<cl_uint> imageResOut(2, 0);
+                error = clEnqueueReadBuffer(cmdQueue, imageRes, CL_TRUE, 0,
+                                            bufferSize, &imageResOut[0], 0,
+                                            NULL, NULL);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("Unable to read buffer");
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                if (imageResOut[0] != width)
+                {
+                    log_error("Invalid width value, test = %u, expected = %u\n",
+                              imageResOut[0], width);
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                if (imageResOut[1] != height)
+                {
+                    log_error(
+                        "Invalid height value, test = %u, expected = %u\n",
+                        imageResOut[1], height);
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+            }
+
+            { // map operation
+                std::vector<T> out(width * height * planeNum, 0);
+                size_t rowPitch = 0;
+                size_t slicePitch = 0;
+                void *mapPtr = clEnqueueMapImage(
+                    cmdQueue, objectDstShared, CL_TRUE,
+                    CL_MAP_READ | CL_MAP_WRITE, origin, region, &rowPitch,
+                    &slicePitch, 0, 0, 0, &error);
+                if (error != CL_SUCCESS || mapPtr == NULL)
+                {
+                    log_error("clEnqueueMapImage failed: %s\n",
+                              IGetErrorString(error));
+                    result.ResultSub(CResult::TEST_FAIL);
+                    // mapPtr is unusable; the copies below would crash on it
+                    return result.Result();
+                }
+
+                for (size_t y = 0; y < height; ++y)
+                    memcpy(&out[y * width * planeNum],
+                           static_cast<T *>(mapPtr) + y * rowPitch / sizeof(T),
+                           width * planeNum * sizeof(T));
+
+                // NOTE(review): the kernel subtracts 0.2f from the data written
+                // above; presumably bufferExp == bufferIn + 0.2 -- confirm.
+                if (!DataCompare(surfaceFormat, format.image_channel_data_type,
+                                 out, bufferIn[frameIdx % FRAME_NUM], width,
+                                 height, planeNum))
+                {
+                    log_error("Frame idx: %u, Mapped OCL object is different "
+                              "than expected\n",
+                              frameIdx);
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+
+                for (size_t y = 0; y < height; ++y)
+                    memcpy(
+                        static_cast<T *>(mapPtr) + y * rowPitch / sizeof(T),
+                        &bufferExp[frameIdx % FRAME_NUM][y * width * planeNum],
+                        width * planeNum * sizeof(T));
+
+                error = clEnqueueUnmapMemObject(cmdQueue, objectDstShared,
+                                                mapPtr, 0, 0, 0);
+                if (error != CL_SUCCESS)
+                {
+                    log_error("clEnqueueUnmapMemObject failed: %s\n",
+                              IGetErrorString(error));
+                    result.ResultSub(CResult::TEST_FAIL);
+                }
+            }
+
+            error = clEnqueueReleaseDX9MediaSurfacesKHR(
+                cmdQueue, static_cast<cl_uint>(memObjList.size()),
+                &memObjList[0], 0, 0, 0);
+            if (error != CL_SUCCESS)
+            {
+                log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n",
+                          IGetErrorString(error));
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+
+            std::vector<T> out(width * height * planeNum, 0);
+            // surface get
+#if defined(_WIN32)
+            if (FAILED((*dx9SurfaceDst)->LockRect(&rect, NULL, 0)))
+            {
+                log_error("Surface lock failed\n");
+                result.ResultSub(CResult::TEST_ERROR);
+                return result.Result();
+            }
+
+            pitch = rect.Pitch / sizeof(T);
+            ptr = static_cast<T *>(rect.pBits);
+            for (size_t y = 0; y < height; ++y)
+                memcpy(&out[y * width * planeNum], ptr + y * pitch, lineSize);
+
+            (*dx9SurfaceDst)->UnlockRect();
+#else
+            return TEST_NOT_IMPLEMENTED;
+#endif
+
+            if (!DataCompare(surfaceFormat, format.image_channel_data_type, out,
+                             bufferExp[frameIdx % FRAME_NUM], width, height,
+                             planeNum))
+            {
+                log_error(
+                    "Frame idx: %u, media object is different than expected\n",
+                    frameIdx);
+                result.ResultSub(CResult::TEST_FAIL);
+            }
+        }
+    }
+
+    if (deviceWrapper->Status() != DEVICE_PASS)
+    {
+        std::string adapterName;
+        AdapterToString(adapterType, adapterName);
+        if (deviceWrapper->Status() == DEVICE_FAIL)
+        {
+            log_error("%s init failed\n", adapterName.c_str());
+            result.ResultSub(CResult::TEST_FAIL);
+        }
+        else
+        {
+            log_error("%s init incomplete due to unsupported device\n",
+                      adapterName.c_str());
+            result.ResultSub(CResult::TEST_NOTSUPPORTED);
+        }
+    }
+
+    return result.Result();
+}
+
+int test_other_data_types(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
+{
+    CResult result;
+
+#if defined(_WIN32)
+    // D3D9
+    if (other_data_types<cl_float>(deviceID, context, queue, num_elements, 10,
+                                   64, 256, CL_ADAPTER_D3D9_KHR,
+                                   SURFACE_FORMAT_R32F, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9, R32F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(deviceID, context, queue, num_elements, 10,
+                                  256, 128, CL_ADAPTER_D3D9_KHR,
+                                  SURFACE_FORMAT_R16F, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9, R16F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10,
+                                    512, 256, CL_ADAPTER_D3D9_KHR,
+                                    SURFACE_FORMAT_L16, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9, L16, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   256, 512, CL_ADAPTER_D3D9_KHR,
+                                   SURFACE_FORMAT_A8, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9, A8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   1024, 32, CL_ADAPTER_D3D9_KHR,
+                                   SURFACE_FORMAT_L8, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9, L8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_float>(
+            deviceID, context, queue, num_elements, 10, 32, 1024,
+            CL_ADAPTER_D3D9_KHR, SURFACE_FORMAT_G32R32F, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9, G32R32F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(
+            deviceID, context, queue, num_elements, 10, 64, 64,
+            CL_ADAPTER_D3D9_KHR, SURFACE_FORMAT_G16R16F, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9, G16R16F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(
+            deviceID, context, queue, num_elements, 10, 256, 256,
+            CL_ADAPTER_D3D9_KHR, SURFACE_FORMAT_G16R16, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9, G16R16, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   512, 128, CL_ADAPTER_D3D9_KHR,
+                                   SURFACE_FORMAT_A8L8, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9, A8L8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_float>(deviceID, context, queue, num_elements, 10,
+                                   128, 512, CL_ADAPTER_D3D9_KHR,
+                                   SURFACE_FORMAT_A32B32G32R32F,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (D3D9, A32B32G32R32F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(deviceID, context, queue, num_elements, 10,
+                                  128, 128, CL_ADAPTER_D3D9_KHR,
+                                  SURFACE_FORMAT_A16B16G16R16F,
+                                  SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (D3D9, A16B16G16R16F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10,
+                                    64, 128, CL_ADAPTER_D3D9_KHR,
+                                    SURFACE_FORMAT_A16B16G16R16,
+                                    SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (D3D9, A16B16G16R16, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   128, 64, CL_ADAPTER_D3D9_KHR,
+                                   SURFACE_FORMAT_A8B8G8R8,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9, A8B8G8R8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   16, 512, CL_ADAPTER_D3D9_KHR,
+                                   SURFACE_FORMAT_X8B8G8R8,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9, X8B8G8R8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   512, 16, CL_ADAPTER_D3D9_KHR,
+                                   SURFACE_FORMAT_A8R8G8B8,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9, A8R8G8B8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   256, 256, CL_ADAPTER_D3D9_KHR,
+                                   SURFACE_FORMAT_X8R8G8B8,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9, X8R8G8B8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    // D3D9EX
+
+    if (other_data_types<cl_float>(deviceID, context, queue, num_elements, 10,
+                                   64, 256, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_R32F, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, R32F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_float>(deviceID, context, queue, num_elements, 10,
+                                   64, 256, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_R32F, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, R32F, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(deviceID, context, queue, num_elements, 10,
+                                  256, 128, CL_ADAPTER_D3D9EX_KHR,
+                                  SURFACE_FORMAT_R16F, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, R16F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(deviceID, context, queue, num_elements, 10,
+                                  256, 128, CL_ADAPTER_D3D9EX_KHR,
+                                  SURFACE_FORMAT_R16F, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, R16F, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10,
+                                    512, 256, CL_ADAPTER_D3D9EX_KHR,
+                                    SURFACE_FORMAT_L16, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, L16, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10,
+                                    512, 256, CL_ADAPTER_D3D9EX_KHR,
+                                    SURFACE_FORMAT_L16, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, L16, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   256, 512, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_A8, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, A8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   256, 512, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_A8, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, A8, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   1024, 32, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_L8, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, L8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   1024, 32, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_L8, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, L8, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_float>(deviceID, context, queue, num_elements, 10,
+                                   32, 1024, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_G32R32F,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, G32R32F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_float>(deviceID, context, queue, num_elements, 10,
+                                   32, 1024, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_G32R32F,
+                                   SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, G32R32F, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(deviceID, context, queue, num_elements, 10,
+                                  64, 64, CL_ADAPTER_D3D9EX_KHR,
+                                  SURFACE_FORMAT_G16R16F,
+                                  SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, G16R16F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(deviceID, context, queue, num_elements, 10,
+                                  64, 64, CL_ADAPTER_D3D9EX_KHR,
+                                  SURFACE_FORMAT_G16R16F, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, G16R16F, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10,
+                                    256, 256, CL_ADAPTER_D3D9EX_KHR,
+                                    SURFACE_FORMAT_G16R16,
+                                    SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, G16R16, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(
+            deviceID, context, queue, num_elements, 10, 256, 256,
+            CL_ADAPTER_D3D9EX_KHR, SURFACE_FORMAT_G16R16, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, G16R16, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   512, 128, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_A8L8, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, A8L8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   512, 128, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_A8L8, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, A8L8, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_float>(deviceID, context, queue, num_elements, 10,
+                                   128, 512, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_A32B32G32R32F,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (D3D9EX, A32B32G32R32F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_float>(deviceID, context, queue, num_elements, 10,
+                                   128, 512, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_A32B32G32R32F,
+                                   SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (D3D9EX, A32B32G32R32F, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(deviceID, context, queue, num_elements, 10,
+                                  128, 128, CL_ADAPTER_D3D9EX_KHR,
+                                  SURFACE_FORMAT_A16B16G16R16F,
+                                  SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (D3D9EX, A16B16G16R16F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(deviceID, context, queue, num_elements, 10,
+                                  128, 128, CL_ADAPTER_D3D9EX_KHR,
+                                  SURFACE_FORMAT_A16B16G16R16F,
+                                  SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (D3D9EX, A16B16G16R16F, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10,
+                                    64, 128, CL_ADAPTER_D3D9EX_KHR,
+                                    SURFACE_FORMAT_A16B16G16R16,
+                                    SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (D3D9EX, A16B16G16R16, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10,
+                                    64, 128, CL_ADAPTER_D3D9EX_KHR,
+                                    SURFACE_FORMAT_A16B16G16R16,
+                                    SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (D3D9EX, A16B16G16R16, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   128, 64, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_A8B8G8R8,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (D3D9EX, A8B8G8R8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   128, 64, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_A8B8G8R8,
+                                   SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, A8B8G8R8, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   16, 512, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_X8B8G8R8,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (D3D9EX, X8B8G8R8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   16, 512, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_X8B8G8R8,
+                                   SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, X8B8G8R8, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   512, 16, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_A8R8G8B8,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (D3D9EX, A8R8G8B8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   512, 16, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_A8R8G8B8,
+                                   SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, A8R8G8B8, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   256, 256, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_X8R8G8B8,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (D3D9EX, X8R8G8B8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   256, 256, CL_ADAPTER_D3D9EX_KHR,
+                                   SURFACE_FORMAT_X8R8G8B8,
+                                   SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (D3D9EX, X8R8G8B8, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    // DXVA
+
+    if (other_data_types<cl_float>(deviceID, context, queue, num_elements, 10,
+                                   64, 256, CL_ADAPTER_DXVA_KHR,
+                                   SURFACE_FORMAT_R32F, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, R32F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_float>(deviceID, context, queue, num_elements, 10,
+                                   64, 256, CL_ADAPTER_DXVA_KHR,
+                                   SURFACE_FORMAT_R32F, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, R32F, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(deviceID, context, queue, num_elements, 10,
+                                  256, 128, CL_ADAPTER_DXVA_KHR,
+                                  SURFACE_FORMAT_R16F, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, R16F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(deviceID, context, queue, num_elements, 10,
+                                  256, 128, CL_ADAPTER_DXVA_KHR,
+                                  SURFACE_FORMAT_R16F, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, R16F, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10,
+                                    512, 256, CL_ADAPTER_DXVA_KHR,
+                                    SURFACE_FORMAT_L16, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, L16, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10,
+                                    512, 256, CL_ADAPTER_DXVA_KHR,
+                                    SURFACE_FORMAT_L16, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, L16, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   256, 512, CL_ADAPTER_DXVA_KHR,
+                                   SURFACE_FORMAT_A8, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, A8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   256, 512, CL_ADAPTER_DXVA_KHR,
+                                   SURFACE_FORMAT_A8, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, A8, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   1024, 32, CL_ADAPTER_DXVA_KHR,
+                                   SURFACE_FORMAT_L8, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, L8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   1024, 32, CL_ADAPTER_DXVA_KHR,
+                                   SURFACE_FORMAT_L8, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, L8, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_float>(
+            deviceID, context, queue, num_elements, 10, 32, 1024,
+            CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_G32R32F, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, G32R32F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_float>(
+            deviceID, context, queue, num_elements, 10, 32, 1024,
+            CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_G32R32F, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, G32R32F, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(
+            deviceID, context, queue, num_elements, 10, 64, 64,
+            CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_G16R16F, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, G16R16F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(deviceID, context, queue, num_elements, 10,
+                                  64, 64, CL_ADAPTER_DXVA_KHR,
+                                  SURFACE_FORMAT_G16R16F, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, G16R16F, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(
+            deviceID, context, queue, num_elements, 10, 256, 256,
+            CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_G16R16, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, G16R16, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(
+            deviceID, context, queue, num_elements, 10, 256, 256,
+            CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_G16R16, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, G16R16, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   512, 128, CL_ADAPTER_DXVA_KHR,
+                                   SURFACE_FORMAT_A8L8, SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, A8L8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   512, 128, CL_ADAPTER_DXVA_KHR,
+                                   SURFACE_FORMAT_A8L8, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, A8L8, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_float>(deviceID, context, queue, num_elements, 10,
+                                   128, 512, CL_ADAPTER_DXVA_KHR,
+                                   SURFACE_FORMAT_A32B32G32R32F,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (DXVA, A32B32G32R32F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_float>(deviceID, context, queue, num_elements, 10,
+                                   128, 512, CL_ADAPTER_DXVA_KHR,
+                                   SURFACE_FORMAT_A32B32G32R32F,
+                                   SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (DXVA, A32B32G32R32F, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(deviceID, context, queue, num_elements, 10,
+                                  128, 128, CL_ADAPTER_DXVA_KHR,
+                                  SURFACE_FORMAT_A16B16G16R16F,
+                                  SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (DXVA, A16B16G16R16F, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_half>(deviceID, context, queue, num_elements, 10,
+                                  128, 128, CL_ADAPTER_DXVA_KHR,
+                                  SURFACE_FORMAT_A16B16G16R16F,
+                                  SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (DXVA, A16B16G16R16F, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10,
+                                    64, 128, CL_ADAPTER_DXVA_KHR,
+                                    SURFACE_FORMAT_A16B16G16R16,
+                                    SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error(
+            "\nTest case (DXVA, A16B16G16R16, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10,
+                                    64, 128, CL_ADAPTER_DXVA_KHR,
+                                    SURFACE_FORMAT_A16B16G16R16,
+                                    SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, A16B16G16R16, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   128, 64, CL_ADAPTER_DXVA_KHR,
+                                   SURFACE_FORMAT_A8B8G8R8,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, A8B8G8R8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(
+            deviceID, context, queue, num_elements, 10, 128, 64,
+            CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_A8B8G8R8, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, A8B8G8R8, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   16, 512, CL_ADAPTER_DXVA_KHR,
+                                   SURFACE_FORMAT_X8B8G8R8,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, X8B8G8R8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(
+            deviceID, context, queue, num_elements, 10, 16, 512,
+            CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_X8B8G8R8, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, X8B8G8R8, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   512, 16, CL_ADAPTER_DXVA_KHR,
+                                   SURFACE_FORMAT_A8R8G8B8,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, A8R8G8B8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(
+            deviceID, context, queue, num_elements, 10, 512, 16,
+            CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_A8R8G8B8, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, A8R8G8B8, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10,
+                                   256, 256, CL_ADAPTER_DXVA_KHR,
+                                   SURFACE_FORMAT_X8R8G8B8,
+                                   SHARED_HANDLE_DISABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, X8R8G8B8, no shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+    if (other_data_types<cl_uchar>(
+            deviceID, context, queue, num_elements, 10, 256, 256,
+            CL_ADAPTER_DXVA_KHR, SURFACE_FORMAT_X8R8G8B8, SHARED_HANDLE_ENABLED)
+        != 0)
+    {
+        log_error("\nTest case (DXVA, X8R8G8B8, shared handle) failed\n\n");
+        result.ResultSub(CResult::TEST_FAIL);
+    }
+
+#else
+    return TEST_NOT_IMPLEMENTED;
+#endif
+
+    return result.Result();
+}
diff --git a/test_conformance/extensions/cl_khr_dx9_media_sharing/utils.cpp b/test_conformance/extensions/cl_khr_dx9_media_sharing/utils.cpp
new file mode 100644
index 0000000000..87eb13c3ca
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_dx9_media_sharing/utils.cpp
@@ -0,0 +1,1664 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "utils.h"
+
+#include "harness/errorHelpers.h"
+#include "harness/imageHelpers.h"
+#include "harness/rounding_mode.h"
+
+#include <math.h>
+
+#include <CL/cl_half.h>
+
+static RoundingMode gFloatToHalfRoundingMode = kDefaultRoundingMode;
+
+
+CResult::CResult(): _result(TEST_PASS), _resultLast(TEST_NORESULT) {}
+// Above: starts as TEST_PASS with no last sub-result. Below: empty destructor.
+CResult::~CResult() {}
+// Returns the stored _resultLast value.
+CResult::TTestResult CResult::ResultLast() const { return _resultLast; }
+
+// Converts the accumulated result into the integer handed back to the caller.
+int CResult::Result() const
+{
+    // "No result", "not supported" and "pass" are all reported as 0.
+    const bool succeeded = _result == TEST_NORESULT
+        || _result == TEST_NOTSUPPORTED || _result == TEST_PASS;
+    if (succeeded) return 0;
+    if (_result == TEST_FAIL) return 1;
+    if (_result == TEST_ERROR) return 2;
+    // Any other enumerator value.
+    return -1;
+}
+
+void CResult::ResultSub(TTestResult result) // fold in one sub-test outcome
+{
+    if (static_cast<int>(_result) < static_cast<int>(result)) _result = result;
+    _resultLast = result; // kept even if _result is unchanged
+}
+
+// Stores the name of the context-creation call selected by
+// `contextCreateFunction` in `contextFunction`; unknown values are logged.
+void FunctionContextCreateToString(TContextFuncType contextCreateFunction,
+                                   std::string &contextFunction)
+{
+    if (contextCreateFunction == CONTEXT_CREATE_DEFAULT)
+        contextFunction = "CreateContext";
+    else if (contextCreateFunction == CONTEXT_CREATE_FROM_TYPE)
+        contextFunction = "CreateContextFromType";
+    else
+    {
+        contextFunction = "Unknown";
+        log_error("FunctionContextCreateToString(): Unknown create "
+                  "function enum!\n");
+    }
+}
+
+// Stores a printable name for `adapterType` in `adapter`; unknown adapter
+// types yield "Unknown" and are logged as an error.
+void AdapterToString(cl_dx9_media_adapter_type_khr adapterType,
+                     std::string &adapter)
+{
+    switch (adapterType)
+    {
+        case CL_ADAPTER_D3D9_KHR: adapter = "D3D9"; return;
+        case CL_ADAPTER_D3D9EX_KHR: adapter = "D3D9EX"; return;
+        case CL_ADAPTER_DXVA_KHR: adapter = "DXVA"; return;
+        default: adapter = "Unknown";
+    }
+    log_error("AdapterToString(): Unknown adapter type!\n");
+}
+
+// Returns the CL_CONTEXT_ADAPTER_* value for `adapterType`, or 0 if unknown.
+cl_context_info
+AdapterTypeToContextInfo(cl_dx9_media_adapter_type_khr adapterType)
+{
+    switch (adapterType)
+    {
+        case CL_ADAPTER_D3D9_KHR: return CL_CONTEXT_ADAPTER_D3D9_KHR;
+        case CL_ADAPTER_D3D9EX_KHR: return CL_CONTEXT_ADAPTER_D3D9EX_KHR;
+        case CL_ADAPTER_DXVA_KHR: return CL_CONTEXT_ADAPTER_DXVA_KHR;
+        default: break;
+    }
+    log_error("AdapterTypeToContextInfo(): Unknown adapter type!\n");
+    return 0;
+}
+
+// Fills `yuv` with a synthetic NV12 image: a full-resolution Y plane followed
+// by height / 2 rows of interleaved chroma byte pairs. Samples start at
+// valueMin and grow with column and row; valueAdd is scaled by 255 and added.
+void YUVGenerateNV12(std::vector<cl_uchar> &yuv, unsigned int width,
+                     unsigned int height, cl_uchar valueMin, cl_uchar valueMax,
+                     double valueAdd)
+{
+    yuv.assign(width * height * 3 / 2, 0);
+
+    const double lo = static_cast<double>(valueMin);
+    const double hi = static_cast<double>(valueMax);
+    const double bias = static_cast<double>(valueAdd * 255.0);
+    const double dx = (hi - lo) / static_cast<double>(width);
+    const double dy = (hi - lo) / static_cast<double>(height);
+
+    // Luma: one byte per pixel, row after row.
+    for (unsigned int row = 0; row < height; ++row)
+    {
+        const unsigned int rowStart = row * width;
+        const double rampY = dy * row;
+        for (unsigned int col = 0; col < width; ++col)
+        {
+            const double rampX = dx * col;
+            yuv.at(rowStart + col) =
+                static_cast<cl_uchar>(lo + rampX / 2 + rampY / 2 + bias);
+        }
+    }
+
+    // Chroma: rows of `width` bytes holding interleaved pairs.
+    for (unsigned int row = 0; row < height / 2; ++row)
+    {
+        const unsigned int rowStart = width * height + row * width;
+        const double firstY = dy * row;
+        const double secondY = dy * (height / 2 + row);
+        for (unsigned int col = 0; col < width / 2; ++col)
+        {
+            const double firstX = dx * col;
+            const double secondX = dx * (width / 2 + col);
+            yuv.at(rowStart + 2 * col) =
+                static_cast<cl_uchar>(lo + firstX / 2 + firstY / 2 + bias);
+            yuv.at(rowStart + 2 * col + 1) =
+                static_cast<cl_uchar>(lo + secondX / 2 + secondY / 2 + bias);
+        }
+    }
+}
+
+// Fills `yuv` with a synthetic YV12 image: a full-resolution Y plane followed
+// by two quarter-size chroma planes (written in V, U order). Samples start at
+// valueMin and grow with column and row; valueAdd is scaled by 255 and added.
+void YUVGenerateYV12(std::vector<cl_uchar> &yuv, unsigned int width,
+                     unsigned int height, cl_uchar valueMin, cl_uchar valueMax,
+                     double valueAdd /*= 0.0*/)
+{
+    yuv.assign(width * height * 3 / 2, 0);
+
+    const double lo = static_cast<double>(valueMin);
+    const double hi = static_cast<double>(valueMax);
+    const double bias = static_cast<double>(valueAdd * 255.0);
+    const double dx = (hi - lo) / static_cast<double>(width);
+    const double dy = (hi - lo) / static_cast<double>(height);
+
+    const unsigned vBase = width * height;
+    const unsigned uBase = vBase + width * height / 4;
+
+    // Y plane
+    for (unsigned int row = 0; row < height; ++row)
+    {
+        const unsigned int rowStart = row * width;
+        const double rampY = dy * row;
+        for (unsigned int col = 0; col < width; ++col)
+        {
+            const double rampX = dx * col;
+            yuv.at(rowStart + col) =
+                static_cast<cl_uchar>(lo + rampX / 2 + rampY / 2 + bias);
+        }
+    }
+
+    // V plane
+    for (unsigned int row = 0; row < height / 2; ++row)
+    {
+        const unsigned int rowStart = vBase + row * width / 2;
+        const double rampY = dy * row;
+        for (unsigned int col = 0; col < width / 2; ++col)
+        {
+            const double rampX = dx * col;
+            yuv.at(rowStart + col) =
+                static_cast<cl_uchar>(lo + rampX / 2 + rampY / 2 + bias);
+        }
+    }
+
+    // U plane: the row ramp is offset by height / 2.
+    for (unsigned int row = 0; row < height / 2; ++row)
+    {
+        const unsigned int rowStart = uBase + row * width / 2;
+        const double rampY = dy * (height / 2 + row);
+        for (unsigned int col = 0; col < width / 2; ++col)
+        {
+            const double rampX = dx * col;
+            yuv.at(rowStart + col) =
+                static_cast<cl_uchar>(lo + rampX / 2 + rampY / 2 + bias);
+        }
+    }
+}
+
+
+// Dispatches to the NV12 or YV12 generator according to `surfaceFormat`.
+// Returns false (after logging) for any other format, true otherwise.
+bool YUVGenerate(TSurfaceFormat surfaceFormat, std::vector<cl_uchar> &yuv,
+                 unsigned int width, unsigned int height, cl_uchar valueMin,
+                 cl_uchar valueMax, double valueAdd /*= 0.0*/)
+{
+    if (surfaceFormat == SURFACE_FORMAT_NV12)
+    {
+        YUVGenerateNV12(yuv, width, height, valueMin, valueMax, valueAdd);
+        return true;
+    }
+    if (surfaceFormat == SURFACE_FORMAT_YV12)
+    {
+        YUVGenerateYV12(yuv, width, height, valueMin, valueMax, valueAdd);
+        return true;
+    }
+    // Only the two formats handled above can be generated.
+    log_error("YUVGenerate(): Invalid surface type\n");
+    return false;
+}
+
+bool YUVSurfaceSetNV12(std::auto_ptr<CSurfaceWrapper> &surface,
+                       const std::vector<cl_uchar> &yuv, unsigned int width,
+                       unsigned int height)
+{ // Copies the NV12 image in `yuv` into `surface`; false if it cannot.
+#if defined(_WIN32)
+    CD3D9SurfaceWrapper *d3dSurface =
+        static_cast<CD3D9SurfaceWrapper *>(surface.get());
+    D3DLOCKED_RECT rect;
+    if (FAILED((*d3dSurface)->LockRect(&rect, NULL, 0)))
+    {
+        log_error("YUVSurfaceSetNV12(): Surface lock failed\n");
+        return false;
+    }
+    // Surface rows are rect.Pitch bytes apart; source rows are `width` bytes.
+    size_t pitch = rect.Pitch / sizeof(cl_uchar);
+    size_t lineSize = width * sizeof(cl_uchar);
+    cl_uchar *ptr = static_cast<cl_uchar *>(rect.pBits);
+    for (size_t y = 0; y < height; ++y)
+        memcpy(ptr + y * pitch, &yuv.at(y * width), lineSize);
+    // height / 2 further rows (the data after the Y plane) go below the Y rows.
+    for (size_t y = 0; y < height / 2; ++y)
+        memcpy(ptr + height * pitch + y * pitch,
+               &yuv.at(width * height + y * width), lineSize);
+
+    (*d3dSurface)->UnlockRect();
+
+    return true;
+
+#else
+    return false; // built without _WIN32: always fails
+#endif
+}
+
+bool YUVSurfaceSetYV12(std::auto_ptr<CSurfaceWrapper> &surface,
+                       const std::vector<cl_uchar> &yuv, unsigned int width,
+                       unsigned int height)
+{ // Copies the YV12 image in `yuv` into `surface`; false if it cannot.
+#if defined(_WIN32)
+    CD3D9SurfaceWrapper *d3dSurface =
+        static_cast<CD3D9SurfaceWrapper *>(surface.get());
+    D3DLOCKED_RECT rect;
+    if (FAILED((*d3dSurface)->LockRect(&rect, NULL, 0)))
+    {
+        log_error("YUVSurfaceSetYV12(): Surface lock failed!\n");
+        return false;
+    }
+    // Full-size rows use pitch/lineSize; the two later planes use half of each.
+    size_t pitch = rect.Pitch / sizeof(cl_uchar);
+    size_t pitchHalf = pitch / 2;
+    size_t lineSize = width * sizeof(cl_uchar);
+    size_t lineHalfSize = lineSize / 2;
+    size_t surfaceOffset = 0;
+    size_t yuvOffset = 0;
+    cl_uchar *ptr = static_cast<cl_uchar *>(rect.pBits);
+    // Plane 0: `height` rows of `width` bytes.
+    for (size_t y = 0; y < height; ++y)
+        memcpy(ptr + surfaceOffset + y * pitch, &yuv.at(yuvOffset + y * width),
+               lineSize);
+    // Plane 1: height / 2 rows of width / 2 bytes, placed after plane 0.
+    surfaceOffset += height * pitch;
+    yuvOffset += width * height;
+    for (size_t y = 0; y < height / 2; ++y)
+        memcpy(ptr + surfaceOffset + y * pitchHalf,
+               &yuv.at(yuvOffset + y * lineHalfSize), lineHalfSize);
+    // Plane 2: same size as plane 1, placed directly after it.
+    surfaceOffset += pitchHalf * height / 2;
+    yuvOffset += width * height / 4;
+    for (size_t y = 0; y < height / 2; ++y)
+        memcpy(ptr + surfaceOffset + y * pitchHalf,
+               &yuv.at(yuvOffset + y * lineHalfSize), lineHalfSize);
+
+    (*d3dSurface)->UnlockRect();
+
+    return true;
+
+#else
+    return false; // built without _WIN32: always fails
+#endif
+}
+
+// Writes `yuv` into `surface` with the NV12 or YV12 writer; false otherwise.
+bool YUVSurfaceSet(TSurfaceFormat surfaceFormat,
+                   std::auto_ptr<CSurfaceWrapper> &surface,
+                   const std::vector<cl_uchar> &yuv, unsigned int width,
+                   unsigned int height)
+{
+    bool written = false;
+    if (surfaceFormat == SURFACE_FORMAT_NV12)
+    {
+        written = YUVSurfaceSetNV12(surface, yuv, width, height);
+    }
+    else if (surfaceFormat == SURFACE_FORMAT_YV12)
+    {
+        written = YUVSurfaceSetYV12(surface, yuv, width, height);
+    }
+    else
+    {
+        log_error("YUVSurfaceSet(): Invalid surface type!\n");
+    }
+    return written;
+}
+
+bool YUVSurfaceGetNV12(std::auto_ptr<CSurfaceWrapper> &surface,
+                       std::vector<cl_uchar> &yuv, unsigned int width,
+                       unsigned int height)
+{ // Reads the NV12 contents of `surface` into `yuv`; false if it cannot.
+#if defined(_WIN32)
+    CD3D9SurfaceWrapper *d3dSurface =
+        static_cast<CD3D9SurfaceWrapper *>(surface.get());
+    D3DLOCKED_RECT rect;
+    if (FAILED((*d3dSurface)->LockRect(&rect, NULL, 0)))
+    {
+        log_error("YUVSurfaceGetNV12(): Surface lock failed!\n");
+        return false;
+    }
+    // `yuv` is not resized here; .at() throws if a row start is out of range.
+    size_t pitch = rect.Pitch / sizeof(cl_uchar);
+    size_t lineSize = width * sizeof(cl_uchar);
+    cl_uchar *ptr = static_cast<cl_uchar *>(rect.pBits);
+    size_t yuvOffset = 0;
+    size_t surfaceOffset = 0;
+    for (size_t y = 0; y < height; ++y)
+        memcpy(&yuv.at(yuvOffset + y * width), ptr + y * pitch, lineSize);
+    // Then height / 2 more rows of `width` bytes from below the Y rows.
+    yuvOffset += width * height;
+    surfaceOffset += pitch * height;
+    for (size_t y = 0; y < height / 2; ++y)
+        memcpy(&yuv.at(yuvOffset + y * width), ptr + surfaceOffset + y * pitch,
+               lineSize);
+
+    (*d3dSurface)->UnlockRect();
+
+    return true;
+
+#else
+    return false; // built without _WIN32: always fails
+#endif
+}
+
+bool YUVSurfaceGetYV12(std::auto_ptr<CSurfaceWrapper> &surface,
+                       std::vector<cl_uchar> &yuv, unsigned int width,
+                       unsigned int height)
+{ // Reads the YV12 contents of `surface` into `yuv`; false if it cannot.
+#if defined(_WIN32)
+    CD3D9SurfaceWrapper *d3dSurface =
+        static_cast<CD3D9SurfaceWrapper *>(surface.get());
+    D3DLOCKED_RECT rect;
+    if (FAILED((*d3dSurface)->LockRect(&rect, NULL, 0)))
+    {
+        log_error("YUVSurfaceGetYV12(): Surface lock failed!\n");
+        return false;
+    }
+    // `yuv` is not resized here; .at() throws if a row start is out of range.
+    size_t pitch = rect.Pitch / sizeof(cl_uchar);
+    size_t pitchHalf = pitch / 2;
+    size_t lineSize = width * sizeof(cl_uchar);
+    size_t lineHalfSize = lineSize / 2;
+    size_t surfaceOffset = 0;
+    size_t yuvOffset = 0;
+    cl_uchar *ptr = static_cast<cl_uchar *>(rect.pBits);
+    // Plane 0: `height` rows of `width` bytes.
+    for (size_t y = 0; y < height; ++y)
+        memcpy(&yuv.at(yuvOffset + y * width), ptr + surfaceOffset + y * pitch,
+               lineSize);
+    // Plane 1: height / 2 rows of width / 2 bytes, read from after plane 0.
+    surfaceOffset += pitch * height;
+    yuvOffset += width * height;
+    for (size_t y = 0; y < height / 2; ++y)
+        memcpy(&yuv.at(yuvOffset + y * lineHalfSize),
+               ptr + surfaceOffset + y * pitchHalf, lineHalfSize);
+    // Plane 2: same size as plane 1, read from directly after it.
+    surfaceOffset += pitchHalf * height / 2;
+    yuvOffset += width * height / 4;
+    for (size_t y = 0; y < height / 2; ++y)
+        memcpy(&yuv.at(yuvOffset + y * lineHalfSize),
+               ptr + surfaceOffset + y * pitchHalf, lineHalfSize);
+
+    (*d3dSurface)->UnlockRect();
+
+    return true;
+
+#else
+    return false; // built without _WIN32: always fails
+#endif
+}
+
+// Reads `surface` into `yuv` with the NV12 or YV12 reader; false otherwise.
+bool YUVSurfaceGet(TSurfaceFormat surfaceFormat,
+                   std::auto_ptr<CSurfaceWrapper> &surface,
+                   std::vector<cl_uchar> &yuv, unsigned int width,
+                   unsigned int height)
+{
+    bool fetched = false;
+    if (surfaceFormat == SURFACE_FORMAT_NV12)
+    {
+        fetched = YUVSurfaceGetNV12(surface, yuv, width, height);
+    }
+    else if (surfaceFormat == SURFACE_FORMAT_YV12)
+    {
+        fetched = YUVSurfaceGetYV12(surface, yuv, width, height);
+    }
+    else
+    {
+        log_error("YUVSurfaceGet(): Invalid surface type!\n");
+    }
+    return fetched;
+}
+
+// Element-wise comparison of two NV12 images (Y plane, then the interleaved
+// chroma rows). Logs the first difference and returns false; true if none.
+bool YUVCompareNV12(const std::vector<cl_uchar> &yuvTest,
+                    const std::vector<cl_uchar> &yuvRef, unsigned int width,
+                    unsigned int height)
+{
+    // plane 0 verification
+    for (size_t y = 0; y < height; ++y)
+    {
+        size_t plane0Offset = width * y;
+        for (size_t x = 0; x < width; ++x)
+        {
+            if (yuvTest[plane0Offset + x] != yuvRef[plane0Offset + x])
+            {
+                log_error("Plane 0 (Y) is different than expected, reference "
+                          "value: %i, test value: %i, x: %zu, y: %zu\n",
+                          yuvRef[plane0Offset + x], yuvTest[plane0Offset + x],
+                          x, y);
+                return false;
+            }
+        }
+    }
+
+    // plane 1 and 2 verification; x and y are size_t, hence %zu in the logs
+    const size_t offset = width * height;
+    for (size_t y = 0; y < height / 2; ++y)
+    {
+        size_t plane12Offset = offset + width * y;
+        for (size_t x = 0; x < width / 2; ++x)
+        {
+            if (yuvTest.at(plane12Offset + 2 * x)
+                != yuvRef.at(plane12Offset + 2 * x))
+            {
+                log_error("Plane 1 (U) is different than expected, reference "
+                          "value: %i, test value: %i, x: %zu, y: %zu\n",
+                          yuvRef[plane12Offset + 2 * x],
+                          yuvTest[plane12Offset + 2 * x], x, y);
+                return false;
+            }
+            if (yuvTest.at(plane12Offset + 2 * x + 1)
+                != yuvRef.at(plane12Offset + 2 * x + 1))
+            {
+                log_error("Plane 2 (V) is different than expected, reference "
+                          "value: %i, test value: %i, x: %zu, y: %zu\n",
+                          yuvRef[plane12Offset + 2 * x + 1],
+                          yuvTest[plane12Offset + 2 * x + 1], x, y);
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+// Element-wise comparison of two YV12 images, plane by plane (Y, then V, then
+// U). Logs the first difference and returns false; returns true if none.
+bool YUVCompareYV12(const std::vector<cl_uchar> &yuvTest,
+                    const std::vector<cl_uchar> &yuvRef, unsigned int width,
+                    unsigned int height)
+{
+    // plane 0 verification; x and y are size_t, hence %zu in the log calls
+    for (size_t y = 0; y < height; ++y)
+    {
+        size_t plane0Offset = width * y;
+        for (size_t x = 0; x < width; ++x)
+        {
+            if (yuvTest.at(plane0Offset + x) != yuvRef.at(plane0Offset + x))
+            {
+                log_error("Plane 0 (Y) is different than expected, reference "
+                          "value: %i, test value: %i, x: %zu, y: %zu\n",
+                          yuvRef[plane0Offset + x], yuvTest[plane0Offset + x],
+                          x, y);
+                return false;
+            }
+        }
+    }
+
+    // plane 1 verification
+    size_t offset = width * height;
+    for (size_t y = 0; y < height / 2; ++y)
+    {
+        size_t plane1Offset = offset + width * y / 2;
+        for (size_t x = 0; x < width / 2; ++x)
+        {
+            if (yuvTest.at(plane1Offset + x) != yuvRef.at(plane1Offset + x))
+            {
+                log_error("Plane 1 (V) is different than expected, reference "
+                          "value: %i, test value: %i, x: %zu, y: %zu\n",
+                          yuvRef[plane1Offset + x], yuvTest[plane1Offset + x],
+                          x, y);
+                return false;
+            }
+        }
+    }
+
+    // plane 2 verification
+    offset += width * height / 4;
+    for (size_t y = 0; y < height / 2; ++y)
+    {
+        size_t plane2Offset = offset + width * y / 2;
+        for (size_t x = 0; x < width / 2; ++x)
+        {
+            if (yuvTest.at(plane2Offset + x) != yuvRef.at(plane2Offset + x))
+            {
+                log_error("Plane 2 (U) is different than expected, reference "
+                          "value: %i, test value: %i, x: %zu, y: %zu\n",
+                          yuvRef[plane2Offset + x], yuvTest[plane2Offset + x],
+                          x, y);
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+// Compares `yuvTest` against `yuvRef` with the comparator for `surfaceFormat`.
+// Returns true only when the images match; every failure path logs a message.
+bool YUVCompare(TSurfaceFormat surfaceFormat,
+                const std::vector<cl_uchar> &yuvTest,
+                const std::vector<cl_uchar> &yuvRef, unsigned int width,
+                unsigned int height)
+{
+    bool same = false;
+
+    // Pick the comparator that matches the requested format.
+    if (surfaceFormat == SURFACE_FORMAT_NV12)
+    {
+        same = YUVCompareNV12(yuvTest, yuvRef, width, height);
+    }
+    else if (surfaceFormat == SURFACE_FORMAT_YV12)
+    {
+        same = YUVCompareYV12(yuvTest, yuvRef, width, height);
+    }
+    else
+    {
+        log_error("YUVCompare(): Invalid surface type!\n");
+        return false;
+    }
+
+    // A failed comparison gets one additional summary line in the log.
+    if (!same) log_error("OCL object is different than expected!\n");
+
+    return same;
+}
+
+// Fills `data` with ramps starting at cmin (+ add) for 1, 2 or 4 channels; any
+// other channelNum is logged. `surfaceFormat` and `type` are unused here.
+void DataGenerate(TSurfaceFormat surfaceFormat, cl_channel_type type,
+                  std::vector<float> &data, unsigned int width,
+                  unsigned int height, unsigned int channelNum,
+                  float cmin /*= 0.0f*/, float cmax /*= 1.0f*/,
+                  float add /*= 0.0f*/)
+{
+    data.clear();
+    data.reserve(width * height * channelNum);
+    double valueMin = static_cast<double>(cmin);
+    double valueMax = static_cast<double>(cmax);
+    double stepX = (valueMax - valueMin) / static_cast<double>(width);
+    double stepY = (valueMax - valueMin) / static_cast<double>(height);
+    double valueAdd = static_cast<double>(add);
+    for (unsigned int i = 0; i < height; ++i)
+    {
+        double valueY = static_cast<double>(stepY * i);
+        for (unsigned int j = 0; j < width; ++j)
+        {
+            double valueX = static_cast<double>(stepX * j);
+            switch (channelNum)
+            {
+                case 1:
+                    data.push_back(static_cast<float>(valueMin + valueX / 2
+                                                      + valueY / 2 + valueAdd));
+                    break;
+                case 2:
+                    data.push_back(
+                        static_cast<float>(valueMin + valueX + valueAdd));
+                    data.push_back(
+                        static_cast<float>(valueMin + valueY + valueAdd));
+                    break;
+                case 4:
+                    data.push_back(
+                        static_cast<float>(valueMin + valueX + valueAdd));
+                    data.push_back(
+                        static_cast<float>(valueMin + valueY + valueAdd));
+                    data.push_back(
+                        static_cast<float>(valueMin + valueX / 2 + valueAdd));
+                    data.push_back(
+                        static_cast<float>(valueMin + valueY / 2 + valueAdd));
+                    break;
+                default:
+                    log_error("DataGenerate(): invalid channel number!\n");
+                    return;
+            }
+        }
+    }
+}
+
+// Fills `data` with 16-bit samples for 1, 2 or 4 channels. CL_HALF_FLOAT stores
+// half-precision ramps from cmin; CL_UNORM_INT16 stores integer ramps scaled by
+// 65535. Other types/channel counts are logged. `surfaceFormat` is unused.
+void DataGenerate(TSurfaceFormat surfaceFormat, cl_channel_type type,
+                  std::vector<cl_half> &data, unsigned int width,
+                  unsigned int height, unsigned int channelNum,
+                  float cmin /*= 0.0f*/, float cmax /*= 1.0f*/,
+                  float add /*= 0.0f*/)
+{
+    data.clear();
+    data.reserve(width * height * channelNum);
+
+    double valueMin = static_cast<double>(cmin);
+    double valueMax = static_cast<double>(cmax);
+    double stepX = (valueMax - valueMin) / static_cast<double>(width);
+    double stepY = (valueMax - valueMin) / static_cast<double>(height);
+
+    switch (type)
+    {
+        case CL_HALF_FLOAT: {
+            double valueAdd = static_cast<double>(add);
+
+            for (unsigned int i = 0; i < height; ++i)
+            {
+                double valueY = static_cast<double>(stepY * i);
+                for (unsigned int j = 0; j < width; ++j)
+                {
+                    double valueX = static_cast<double>(stepX * j);
+                    switch (channelNum)
+                    {
+                        case 1:
+                            data.push_back(convert_float_to_half(
+                                static_cast<float>(valueMin + valueX / 2
+                                                   + valueY / 2 + valueAdd)));
+                            break;
+                        case 2:
+                            data.push_back(
+                                convert_float_to_half(static_cast<float>(
+                                    valueMin + valueX + valueAdd)));
+                            data.push_back(
+                                convert_float_to_half(static_cast<float>(
+                                    valueMin + valueY + valueAdd)));
+                            break;
+                        case 4:
+                            data.push_back(
+                                convert_float_to_half(static_cast<float>(
+                                    valueMin + valueX + valueAdd)));
+                            data.push_back(
+                                convert_float_to_half(static_cast<float>(
+                                    valueMin + valueY + valueAdd)));
+                            data.push_back(
+                                convert_float_to_half(static_cast<float>(
+                                    valueMin + valueX / 2 + valueAdd)));
+                            data.push_back(
+                                convert_float_to_half(static_cast<float>(
+                                    valueMin + valueY / 2 + valueAdd)));
+                            break;
+                        default:
+                            log_error(
+                                "DataGenerate(): invalid channel number!\n");
+                            return;
+                    }
+                }
+            }
+            break;
+        }
+        case CL_UNORM_INT16: {
+            double range = 65535;
+            double valueAdd = static_cast<double>(add * range);
+
+            for (unsigned int i = 0; i < height; ++i)
+            {
+                double valueY = static_cast<double>(stepY * i * range);
+                for (unsigned int j = 0; j < width; ++j)
+                {
+                    double valueX = static_cast<double>(stepX * j * range);
+                    switch (channelNum)
+                    {
+                        case 1:
+                            data.push_back(static_cast<cl_ushort>(
+                                valueMin + valueX / 2 + valueY / 2 + valueAdd));
+                            break;
+                        case 2:
+                            data.push_back(static_cast<cl_ushort>(
+                                valueMin + valueX + valueAdd));
+                            data.push_back(static_cast<cl_ushort>(
+                                valueMin + valueY + valueAdd));
+                            break;
+                        case 4:
+                            data.push_back(static_cast<cl_ushort>(
+                                valueMin + valueX + valueAdd));
+                            data.push_back(static_cast<cl_ushort>(
+                                valueMin + valueY + valueAdd));
+                            data.push_back(static_cast<cl_ushort>(
+                                valueMin + valueX / 2 + valueAdd));
+                            data.push_back(static_cast<cl_ushort>(
+                                valueMin + valueY / 2 + valueAdd));
+                            break;
+                        default:
+                            log_error(
+                                "DataGenerate(): invalid channel number!\n");
+                            return;
+                    }
+                }
+            }
+        }
+        break;
+        default:
+            log_error("DataGenerate(): unknown data type!\n");
+            return;
+    }
+}
+
+// Fills `data` with 8-bit ramps (steps and `add` scaled by 255) for 1, 2 or 4
+// channels; X8R8G8B8 gets 0xff as 4th byte. Other channelNum values are logged.
+void DataGenerate(TSurfaceFormat surfaceFormat, cl_channel_type type,
+                  std::vector<cl_uchar> &data, unsigned int width,
+                  unsigned int height, unsigned int channelNum,
+                  float cmin /*= 0.0f*/, float cmax /*= 1.0f*/,
+                  float add /*= 0.0f*/)
+{
+    data.clear();
+    data.reserve(width * height * channelNum);
+    double valueMin = static_cast<double>(cmin);
+    double valueMax = static_cast<double>(cmax);
+    double stepX = (valueMax - valueMin) / static_cast<double>(width);
+    double stepY = (valueMax - valueMin) / static_cast<double>(height);
+
+    double range = 255;
+    double valueAdd = static_cast<double>(add * range);
+
+    for (unsigned int i = 0; i < height; ++i)
+    {
+        double valueY = static_cast<double>(stepY * i * range);
+        for (unsigned int j = 0; j < width; ++j)
+        {
+            double valueX = static_cast<double>(stepX * j * range);
+            switch (channelNum)
+            {
+                case 1:
+                    data.push_back(static_cast<cl_uchar>(
+                        valueMin + valueX / 2 + valueY / 2 + valueAdd));
+                    break;
+                case 2:
+                    data.push_back(
+                        static_cast<cl_uchar>(valueMin + valueX + valueAdd));
+                    data.push_back(
+                        static_cast<cl_uchar>(valueMin + valueY + valueAdd));
+                    break;
+                case 4:
+                    data.push_back(
+                        static_cast<cl_uchar>(valueMin + valueX + valueAdd));
+                    data.push_back(
+                        static_cast<cl_uchar>(valueMin + valueY + valueAdd));
+                    data.push_back(static_cast<cl_uchar>(valueMin + valueX / 2
+                                                         + valueAdd));
+                    if (surfaceFormat == SURFACE_FORMAT_X8R8G8B8)
+                        data.push_back(static_cast<cl_uchar>(0xff));
+                    else
+                        data.push_back(static_cast<cl_uchar>(
+                            valueMin + valueY / 2 + valueAdd));
+                    break;
+                default:
+                    log_error("DataGenerate(): invalid channel number!\n");
+                    return;
+            }
+        }
+    }
+}
+
+// Compares two float images sample by sample (tolerance 1e-6). Uses fabsf():
+// a bare abs() can bind to the integer overload and truncate the difference.
+bool DataCompare(TSurfaceFormat surfaceFormat, cl_channel_type type,
+                 const std::vector<float> &dataTest,
+                 const std::vector<float> &dataExp, unsigned int width,
+                 unsigned int height, unsigned int channelNum)
+{
+    const float epsilon = 0.000001f;
+    for (unsigned int i = 0; i < height; ++i)
+    {
+        unsigned int offset = i * width * channelNum;
+        for (unsigned int j = 0; j < width; ++j)
+        {
+            for (unsigned planeIdx = 0; planeIdx < channelNum; ++planeIdx)
+            {
+                const unsigned int idx = offset + j * channelNum + planeIdx;
+                const float test = dataTest.at(idx);
+                const float ref = dataExp.at(idx);
+                if (fabsf(test - ref) > epsilon)
+                {
+                    log_error(
+                        "Tested image is different than reference (x,y,plane) "
+                        "= (%u,%u,%u), test value = %f, expected value = %f\n",
+                        j, i, planeIdx, test, ref);
+                    return false;
+                }
+            }
+        }
+    }
+    return true;
+}
+
+// Compares two 16-bit images as CL_HALF_FLOAT (tolerance 0.001, via fabsf so
+// the float difference is never truncated) or CL_UNORM_INT16 (tolerance 1).
+bool DataCompare(TSurfaceFormat surfaceFormat, cl_channel_type type,
+                 const std::vector<cl_half> &dataTest,
+                 const std::vector<cl_half> &dataExp, unsigned int width,
+                 unsigned int height, unsigned int channelNum)
+{
+    switch (type)
+    {
+        case CL_HALF_FLOAT: {
+            float epsilon = 0.001f;
+            for (unsigned int i = 0; i < height; ++i)
+            {
+                unsigned int offset = i * width * channelNum;
+                for (unsigned int j = 0; j < width; ++j)
+                {
+                    for (unsigned planeIdx = 0; planeIdx < channelNum;
+                         ++planeIdx)
+                    {
+                        float test = cl_half_to_float(
+                            dataTest.at(offset + j * channelNum + planeIdx));
+                        float ref = cl_half_to_float(
+                            dataExp.at(offset + j * channelNum + planeIdx));
+                        if (fabsf(test - ref) > epsilon)
+                        {
+                            log_error("Tested image is different than "
+                                      "reference (x,y,plane) = "
+                                      "(%u,%u,%u), test value = %f, expected "
+                                      "value = %f\n",
+                                      j, i, planeIdx, test, ref);
+                            return false;
+                        }
+                    }
+                }
+            }
+        }
+        break;
+        case CL_UNORM_INT16: {
+            cl_ushort epsilon = 1;
+            for (unsigned int i = 0; i < height; ++i)
+            {
+                unsigned int offset = i * width * channelNum;
+                for (unsigned int j = 0; j < width; ++j)
+                {
+                    for (unsigned planeIdx = 0; planeIdx < channelNum;
+                         ++planeIdx)
+                    {
+                        cl_ushort test =
+                            dataTest.at(offset + j * channelNum + planeIdx);
+                        cl_ushort ref =
+                            dataExp.at(offset + j * channelNum + planeIdx);
+                        if (abs(test - ref) > epsilon)
+                        {
+                            log_error("Tested image is different than "
+                                      "reference (x,y,plane) = (%u,%u,%u), "
+                                      "test value = %i, expected value = %i\n",
+                                      j, i, planeIdx, test, ref);
+                            return false;
+                        }
+                    }
+                }
+            }
+        }
+        break;
+        default:
+            log_error("DataCompare(): Invalid data format!\n");
+            return false;
+    }
+    return true;
+}
+
+// Byte-exact comparison of 8-bit image data against a reference; index 3 of
+// every X8R8G8B8 pixel is not compared. Logs the first mismatch found.
+bool DataCompare(TSurfaceFormat surfaceFormat, cl_channel_type type,
+                 const std::vector<cl_uchar> &dataTest,
+                 const std::vector<cl_uchar> &dataExp, unsigned int width,
+                 unsigned int height, unsigned int planeNum)
+{
+    const bool skipFourth = (surfaceFormat == SURFACE_FORMAT_X8R8G8B8);
+    for (unsigned int row = 0; row < height; ++row)
+    {
+        const unsigned int rowStart = row * width * planeNum;
+        for (unsigned int col = 0; col < width; ++col)
+        {
+            for (unsigned ch = 0; ch < planeNum; ++ch)
+            {
+                if (skipFourth && ch == 3) continue;
+                const unsigned int idx = rowStart + col * planeNum + ch;
+                const cl_uchar test = dataTest.at(idx);
+                const cl_uchar ref = dataExp.at(idx);
+                if (test == ref) continue;
+                log_error(
+                    "Tested image is different than reference (x,y,plane) "
+                    "= (%i,%i,%i), test value = %i, expected value = %i\n",
+                    col, row, ch, test, ref);
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+// Queries each clGetImageInfo property of `object` and checks it against the
+// expected value; logs every failure and returns false if any check failed.
+bool GetImageInfo(cl_mem object, cl_image_format formatExp,
+                  size_t elementSizeExp, size_t rowPitchExp,
+                  size_t slicePitchExp, size_t widthExp, size_t heightExp,
+                  size_t depthExp, unsigned int planeExp)
+{
+    bool result = true;
+    cl_image_format format = { 0, 0 }; // defined even if the query fails
+    if (clGetImageInfo(object, CL_IMAGE_FORMAT, sizeof(format), &format, 0)
+        != CL_SUCCESS)
+    {
+        log_error("clGetImageInfo(CL_IMAGE_FORMAT) failed\n");
+        result = false;
+    }
+
+    if (formatExp.image_channel_order != format.image_channel_order
+        || formatExp.image_channel_data_type != format.image_channel_data_type)
+    {
+        log_error("Value of CL_IMAGE_FORMAT is different than expected\n");
+        result = false;
+    }
+
+    size_t elementSize = 0;
+    if (clGetImageInfo(object, CL_IMAGE_ELEMENT_SIZE, sizeof(size_t),
+                       &elementSize, 0)
+        != CL_SUCCESS)
+    {
+        log_error("clGetImageInfo(CL_IMAGE_ELEMENT_SIZE) failed\n");
+        result = false;
+    }
+
+    if (elementSizeExp != elementSize)
+    {
+        log_error("Value of CL_IMAGE_ELEMENT_SIZE is different than expected "
+                  "(size: %zu, exp size: %zu)\n",
+                  elementSize, elementSizeExp);
+        result = false;
+    }
+
+    size_t rowPitch = 0;
+    if (clGetImageInfo(object, CL_IMAGE_ROW_PITCH, sizeof(size_t), &rowPitch, 0)
+        != CL_SUCCESS)
+    {
+        log_error("clGetImageInfo(CL_IMAGE_ROW_PITCH) failed\n");
+        result = false;
+    }
+    // Pitches: expected 0 must match exactly, otherwise actual >= expected.
+    if ((rowPitchExp == 0 && rowPitchExp != rowPitch)
+        || (rowPitchExp > 0 && rowPitchExp > rowPitch))
+    {
+        log_error("Value of CL_IMAGE_ROW_PITCH is different than expected "
+                  "(size: %zu, exp size: %zu)\n",
+                  rowPitch, rowPitchExp);
+        result = false;
+    }
+
+    size_t slicePitch = 0;
+    if (clGetImageInfo(object, CL_IMAGE_SLICE_PITCH, sizeof(size_t),
+                       &slicePitch, 0)
+        != CL_SUCCESS)
+    {
+        log_error("clGetImageInfo(CL_IMAGE_SLICE_PITCH) failed\n");
+        result = false;
+    }
+
+    if ((slicePitchExp == 0 && slicePitchExp != slicePitch)
+        || (slicePitchExp > 0 && slicePitchExp > slicePitch))
+    {
+        log_error("Value of CL_IMAGE_SLICE_PITCH is different than expected "
+                  "(size: %zu, exp size: %zu)\n",
+                  slicePitch, slicePitchExp);
+        result = false;
+    }
+
+    size_t width = 0;
+    if (clGetImageInfo(object, CL_IMAGE_WIDTH, sizeof(size_t), &width, 0)
+        != CL_SUCCESS)
+    {
+        log_error("clGetImageInfo(CL_IMAGE_WIDTH) failed\n");
+        result = false;
+    }
+
+    if (widthExp != width)
+    {
+        log_error("Value of CL_IMAGE_WIDTH is different than expected (size: "
+                  "%zu, exp size: %zu)\n",
+                  width, widthExp);
+        result = false;
+    }
+
+    size_t height = 0;
+    if (clGetImageInfo(object, CL_IMAGE_HEIGHT, sizeof(size_t), &height, 0)
+        != CL_SUCCESS)
+    {
+        log_error("clGetImageInfo(CL_IMAGE_HEIGHT) failed\n");
+        result = false;
+    }
+
+    if (heightExp != height)
+    {
+        log_error("Value of CL_IMAGE_HEIGHT is different than expected (size: "
+                  "%zu, exp size: %zu)\n",
+                  height, heightExp);
+        result = false;
+    }
+
+    size_t depth = 0;
+    if (clGetImageInfo(object, CL_IMAGE_DEPTH, sizeof(size_t), &depth, 0)
+        != CL_SUCCESS)
+    {
+        log_error("clGetImageInfo(CL_IMAGE_DEPTH) failed\n");
+        result = false;
+    }
+
+    if (depthExp != depth)
+    {
+        log_error("Value of CL_IMAGE_DEPTH is different than expected (size: "
+                  "%zu, exp size: %zu)\n",
+                  depth, depthExp);
+        result = false;
+    }
+
+    unsigned int plane = 99;
+    size_t paramSize = 0;
+    if (clGetImageInfo(object, CL_IMAGE_DX9_MEDIA_PLANE_KHR,
+                       sizeof(unsigned int), &plane, &paramSize)
+        != CL_SUCCESS)
+    {
+        log_error("clGetImageInfo(CL_IMAGE_DX9_MEDIA_PLANE_KHR) failed\n");
+        result = false;
+    }
+
+    if (planeExp != plane)
+    {
+        log_error("Value of CL_IMAGE_DX9_MEDIA_PLANE_KHR is different than "
+                  "expected (plane: %u, exp plane: %u)\n",
+                  plane, planeExp);
+        result = false;
+    }
+
+    return result;
+}
+
+// Checks the DX9 media surface info and adapter type that clGetMemObjectInfo
+// reports for `object`; always returns false when not built for _WIN32.
+bool GetMemObjInfo(cl_mem object, cl_dx9_media_adapter_type_khr adapterType,
+                   std::auto_ptr<CSurfaceWrapper> &surface,
+                   void *shareHandleExp)
+{
+    bool result = true;
+    switch (adapterType)
+    {
+        case CL_ADAPTER_D3D9_KHR:
+        case CL_ADAPTER_D3D9EX_KHR:
+        case CL_ADAPTER_DXVA_KHR: {
+#if defined(_WIN32)
+            cl_dx9_surface_info_khr surfaceInfo;
+#else
+            void *surfaceInfo = 0;
+            return false;
+#endif
+            size_t paramSize = 0;
+            if (clGetMemObjectInfo(object, CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR,
+                                   sizeof(surfaceInfo), &surfaceInfo,
+                                   &paramSize)
+                != CL_SUCCESS)
+            {
+                log_error("clGetMemObjectInfo("
+                          "CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR) failed\n");
+                result = false;
+            }
+#if defined(_WIN32)
+            CD3D9SurfaceWrapper *d3d9Surface =
+                static_cast<CD3D9SurfaceWrapper *>(surface.get());
+            if (*d3d9Surface != surfaceInfo.resource)
+            {
+                log_error(
+                    "Invalid resource for CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR\n");
+                result = false;
+            }
+
+            if (shareHandleExp != surfaceInfo.shared_handle)
+            {
+                log_error("Invalid shared handle for "
+                          "CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR\n");
+                result = false;
+            }
+#else
+            return false;
+#endif
+
+            if (paramSize != sizeof(surfaceInfo))
+            {
+                log_error("Invalid CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR parameter "
+                          "size: %zu, expected: %zu\n",
+                          paramSize, sizeof(surfaceInfo));
+                result = false;
+            }
+
+            paramSize = 0;
+            cl_dx9_media_adapter_type_khr mediaAdapterType;
+            if (clGetMemObjectInfo(object, CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR,
+                                   sizeof(mediaAdapterType), &mediaAdapterType,
+                                   &paramSize)
+                != CL_SUCCESS)
+            {
+                log_error("clGetMemObjectInfo("
+                          "CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR) failed\n");
+                result = false;
+            }
+
+            if (adapterType != mediaAdapterType)
+            {
+                log_error("Invalid media adapter type for "
+                          "CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR\n");
+                result = false;
+            }
+
+            if (paramSize != sizeof(mediaAdapterType))
+            {
+                log_error("Invalid CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR parameter "
+                          "size: %zu, expected: %zu\n",
+                          paramSize, sizeof(mediaAdapterType));
+                result = false;
+            }
+        }
+        break;
+        default:
+            log_error("GetMemObjInfo(): Unknown adapter type!\n");
+            return false;
+    }
+
+    return result;
+}
+
+// Checks image and mem-object info of per-plane images: memObjList holds a
+// full-size CL_R plane plus either one CL_RG or two CL_R half-size planes.
+bool ImageInfoVerify(cl_dx9_media_adapter_type_khr adapterType,
+                     const std::vector<cl_mem> &memObjList, unsigned int width,
+                     unsigned int height,
+                     std::auto_ptr<CSurfaceWrapper> &surface,
+                     void *sharedHandle)
+{
+    if (memObjList.size() != 2 && memObjList.size() != 3)
+    {
+        log_error("ImageInfoVerify(): Invalid object list parameter\n");
+        return false;
+    }
+
+    cl_image_format formatPlane;
+    formatPlane.image_channel_data_type = CL_UNORM_INT8;
+    formatPlane.image_channel_order = CL_R;
+
+    // plane 0 verification
+    if (!GetImageInfo(memObjList[0], formatPlane, sizeof(cl_uchar),
+                      width * sizeof(cl_uchar), 0, width, height, 0, 0))
+    {
+        log_error("clGetImageInfo failed\n");
+        return false;
+    }
+
+    switch (memObjList.size())
+    {
+        case 2: {
+            formatPlane.image_channel_order = CL_RG;
+            if (!GetImageInfo(memObjList[1], formatPlane, sizeof(cl_uchar) * 2,
+                              width * sizeof(cl_uchar), 0, width / 2,
+                              height / 2, 0, 1))
+            {
+                log_error("clGetImageInfo failed\n");
+                return false;
+            }
+        }
+        break;
+        case 3: {
+            if (!GetImageInfo(memObjList[1], formatPlane, sizeof(cl_uchar),
+                              width * sizeof(cl_uchar) / 2, 0, width / 2,
+                              height / 2, 0, 1))
+            {
+                log_error("clGetImageInfo failed\n");
+                return false;
+            }
+
+            if (!GetImageInfo(memObjList[2], formatPlane, sizeof(cl_uchar),
+                              width * sizeof(cl_uchar) / 2, 0, width / 2,
+                              height / 2, 0, 2))
+            {
+                log_error("clGetImageInfo failed\n");
+                return false;
+            }
+        }
+        break;
+        default:
+            log_error("ImageInfoVerify(): Invalid object list parameter\n");
+            return false;
+    }
+
+    for (size_t i = 0; i < memObjList.size(); ++i)
+    {
+        if (!GetMemObjInfo(memObjList[i], adapterType, surface, sharedHandle))
+        {
+            log_error("clGetMemObjInfo(%zu) failed\n", i);
+            return false;
+        }
+    }
+
+    return true;
+}
+
+// True when `imageFormatCheck` is among the CL_MEM_READ_WRITE formats the
+// context supports for `imageType`; a failed or empty query logs and fails.
+bool ImageFormatCheck(cl_context context, cl_mem_object_type imageType,
+                      const cl_image_format imageFormatCheck)
+{
+    cl_uint count = 0;
+    if (clGetSupportedImageFormats(context, CL_MEM_READ_WRITE, imageType, 0,
+                                   0, &count)
+        != CL_SUCCESS)
+    {
+        log_error("clGetSupportedImageFormats failed\n");
+        return false;
+    }
+
+    if (count < 1)
+    {
+        log_error("Invalid image format number returned by "
+                  "clGetSupportedImageFormats\n");
+        return false;
+    }
+
+    std::vector<cl_image_format> supported(count);
+    if (clGetSupportedImageFormats(context, CL_MEM_READ_WRITE, imageType,
+                                   count, &supported[0], 0)
+        != CL_SUCCESS)
+    {
+        log_error("clGetSupportedImageFormats failed\n");
+        return false;
+    }
+
+    for (size_t k = 0; k < supported.size(); ++k)
+    {
+        const cl_image_format &fmt = supported[k];
+        if (fmt.image_channel_data_type
+                == imageFormatCheck.image_channel_data_type
+            && fmt.image_channel_order == imageFormatCheck.image_channel_order)
+            return true;
+    }
+
+    return false;
+}
+
+// Maps a surface format to its channel count; logs and returns 0 if unknown.
+unsigned int ChannelNum(TSurfaceFormat surfaceFormat)
+{
+    unsigned int channels = 0;
+    switch (surfaceFormat)
+    {
+        case SURFACE_FORMAT_R32F:
+        case SURFACE_FORMAT_R16F:
+        case SURFACE_FORMAT_L16:
+        case SURFACE_FORMAT_A8:
+        case SURFACE_FORMAT_L8: channels = 1; break;
+        case SURFACE_FORMAT_G32R32F:
+        case SURFACE_FORMAT_G16R16F:
+        case SURFACE_FORMAT_G16R16:
+        case SURFACE_FORMAT_A8L8: channels = 2; break;
+        case SURFACE_FORMAT_NV12:
+        case SURFACE_FORMAT_YV12: channels = 3; break;
+        case SURFACE_FORMAT_A32B32G32R32F:
+        case SURFACE_FORMAT_A16B16G16R16F:
+        case SURFACE_FORMAT_A16B16G16R16:
+        case SURFACE_FORMAT_A8B8G8R8:
+        case SURFACE_FORMAT_X8B8G8R8:
+        case SURFACE_FORMAT_A8R8G8B8:
+        case SURFACE_FORMAT_X8R8G8B8: channels = 4; break;
+        default: log_error("ChannelNum(): unknown surface format!\n"); break;
+    }
+    return channels;
+}
+
+// Maps a surface format to its plane count; logs and returns 0 if unknown.
+unsigned int PlanesNum(TSurfaceFormat surfaceFormat)
+{
+    if (surfaceFormat == SURFACE_FORMAT_NV12) return 2;
+    if (surfaceFormat == SURFACE_FORMAT_YV12) return 3;
+    switch (surfaceFormat)
+    {
+        case SURFACE_FORMAT_R32F:
+        case SURFACE_FORMAT_R16F:
+        case SURFACE_FORMAT_L16:
+        case SURFACE_FORMAT_A8:
+        case SURFACE_FORMAT_L8:
+        case SURFACE_FORMAT_G32R32F:
+        case SURFACE_FORMAT_G16R16F:
+        case SURFACE_FORMAT_G16R16:
+        case SURFACE_FORMAT_A8L8:
+        case SURFACE_FORMAT_A32B32G32R32F:
+        case SURFACE_FORMAT_A16B16G16R16F:
+        case SURFACE_FORMAT_A16B16G16R16:
+        case SURFACE_FORMAT_A8B8G8R8:
+        case SURFACE_FORMAT_X8B8G8R8:
+        case SURFACE_FORMAT_A8R8G8B8:
+        case SURFACE_FORMAT_X8R8G8B8: return 1;
+        default: break;
+    }
+    log_error("PlanesNum(): unknown surface format!\n");
+    return 0;
+}
+
+#if defined(_WIN32)
+// Maps a surface format to its D3DFORMAT; unknown values log, yield D3DFMT_R32F.
+D3DFORMAT SurfaceFormatToD3D(TSurfaceFormat surfaceFormat)
+{
+    D3DFORMAT d3d = D3DFMT_R32F; // value reported for unknown formats
+    switch (surfaceFormat)
+    {
+        case SURFACE_FORMAT_R32F: d3d = D3DFMT_R32F; break;
+        case SURFACE_FORMAT_R16F: d3d = D3DFMT_R16F; break;
+        case SURFACE_FORMAT_L16: d3d = D3DFMT_L16; break;
+        case SURFACE_FORMAT_A8: d3d = D3DFMT_A8; break;
+        case SURFACE_FORMAT_L8: d3d = D3DFMT_L8; break;
+        case SURFACE_FORMAT_G32R32F: d3d = D3DFMT_G32R32F; break;
+        case SURFACE_FORMAT_G16R16F: d3d = D3DFMT_G16R16F; break;
+        case SURFACE_FORMAT_G16R16: d3d = D3DFMT_G16R16; break;
+        case SURFACE_FORMAT_A8L8: d3d = D3DFMT_A8L8; break;
+        case SURFACE_FORMAT_A32B32G32R32F: d3d = D3DFMT_A32B32G32R32F; break;
+        case SURFACE_FORMAT_A16B16G16R16F: d3d = D3DFMT_A16B16G16R16F; break;
+        case SURFACE_FORMAT_A16B16G16R16: d3d = D3DFMT_A16B16G16R16; break;
+        case SURFACE_FORMAT_A8B8G8R8: d3d = D3DFMT_A8B8G8R8; break;
+        case SURFACE_FORMAT_X8B8G8R8: d3d = D3DFMT_X8B8G8R8; break;
+        case SURFACE_FORMAT_A8R8G8B8: d3d = D3DFMT_A8R8G8B8; break;
+        case SURFACE_FORMAT_X8R8G8B8: d3d = D3DFMT_X8R8G8B8; break;
+        case SURFACE_FORMAT_NV12:
+            d3d = static_cast<D3DFORMAT>(MAKEFOURCC('N', 'V', '1', '2'));
+            break;
+        case SURFACE_FORMAT_YV12:
+            d3d = static_cast<D3DFORMAT>(MAKEFOURCC('Y', 'V', '1', '2'));
+            break;
+        default: log_error("SurfaceFormatToD3D(): unknown surface format!\n");
+    }
+    return d3d;
+}
+#endif
+
+// Replaces `device` with a wrapper for `adapterType`; returns its Status().
+bool DeviceCreate(cl_dx9_media_adapter_type_khr adapterType,
+                  std::auto_ptr<CDeviceWrapper> &device)
+{
+    switch (adapterType)
+    {
+#if defined(_WIN32)
+        case CL_ADAPTER_D3D9_KHR:
+            device.reset(new CD3D9Wrapper());
+            break;
+        case CL_ADAPTER_D3D9EX_KHR:
+            device.reset(new CD3D9ExWrapper());
+            break;
+        case CL_ADAPTER_DXVA_KHR:
+            device.reset(new CDXVAWrapper());
+            break;
+#endif
+        default:
+            log_error("DeviceCreate(): Unknown adapter type!\n");
+            return false;
+    }
+
+    return device->Status();
+}
+
+// True if the device's adapter supports `surfaceFormat` as a D3D9 surface.
+bool SurfaceFormatCheck(cl_dx9_media_adapter_type_khr adapterType,
+                        const CDeviceWrapper &device,
+                        TSurfaceFormat surfaceFormat)
+{
+    switch (adapterType)
+    {
+#if defined(_WIN32)
+        case CL_ADAPTER_D3D9_KHR:
+        case CL_ADAPTER_D3D9EX_KHR:
+        case CL_ADAPTER_DXVA_KHR: {
+            D3DFORMAT d3dFormat = SurfaceFormatToD3D(surfaceFormat);
+            LPDIRECT3D9 d3d9 = static_cast<LPDIRECT3D9>(device.D3D());
+            D3DDISPLAYMODE d3ddm;
+            UINT adapter = device.AdapterIdx(); // same adapter for both calls
+            if (FAILED(d3d9->GetAdapterDisplayMode(adapter, &d3ddm))
+                || FAILED(d3d9->CheckDeviceFormat(adapter, D3DDEVTYPE_HAL,
+                                                  d3ddm.Format, 0,
+                                                  D3DRTYPE_SURFACE, d3dFormat)))
+                return false;
+        }
+        break;
+#endif
+        default:
+            log_error("SurfaceFormatCheck(): Unknown adapter type!\n");
+            return false;
+    }
+
+    return true;
+}
+
+// Translates a surface format into the equivalent OpenCL image format. An
+// unknown surface format is logged, leaves `format` untouched and yields
+// false.
+bool SurfaceFormatToOCL(TSurfaceFormat surfaceFormat, cl_image_format &format)
+{
+    cl_image_format ocl;
+
+    // Channel order first; this switch also rejects unknown formats.
+    switch (surfaceFormat)
+    {
+        case SURFACE_FORMAT_R32F:
+        case SURFACE_FORMAT_R16F:
+        case SURFACE_FORMAT_L16:
+        case SURFACE_FORMAT_L8:
+        case SURFACE_FORMAT_NV12:
+        case SURFACE_FORMAT_YV12:
+            ocl.image_channel_order = CL_R;
+            break;
+        case SURFACE_FORMAT_A8:
+            ocl.image_channel_order = CL_A;
+            break;
+        case SURFACE_FORMAT_G32R32F:
+        case SURFACE_FORMAT_G16R16F:
+        case SURFACE_FORMAT_G16R16:
+        case SURFACE_FORMAT_A8L8:
+            ocl.image_channel_order = CL_RG;
+            break;
+        case SURFACE_FORMAT_A32B32G32R32F:
+        case SURFACE_FORMAT_A16B16G16R16F:
+        case SURFACE_FORMAT_A16B16G16R16:
+        case SURFACE_FORMAT_A8B8G8R8:
+        case SURFACE_FORMAT_X8B8G8R8:
+            ocl.image_channel_order = CL_RGBA;
+            break;
+        case SURFACE_FORMAT_A8R8G8B8:
+        case SURFACE_FORMAT_X8R8G8B8:
+            ocl.image_channel_order = CL_BGRA;
+            break;
+        default:
+            log_error("SurfaceFormatToOCL(): Unknown surface format!\n");
+            return false;
+    }
+
+    // Channel data type; every format reaching this point is a known one.
+    switch (surfaceFormat)
+    {
+        // 32-bit float channels
+        case SURFACE_FORMAT_R32F:
+        case SURFACE_FORMAT_G32R32F:
+        case SURFACE_FORMAT_A32B32G32R32F:
+            ocl.image_channel_data_type = CL_FLOAT;
+            break;
+        // 16-bit float channels
+        case SURFACE_FORMAT_R16F:
+        case SURFACE_FORMAT_G16R16F:
+        case SURFACE_FORMAT_A16B16G16R16F:
+            ocl.image_channel_data_type = CL_HALF_FLOAT;
+            break;
+        // 16-bit normalized channels
+        case SURFACE_FORMAT_L16:
+        case SURFACE_FORMAT_G16R16:
+        case SURFACE_FORMAT_A16B16G16R16:
+            ocl.image_channel_data_type = CL_UNORM_INT16;
+            break;
+        // 8-bit normalized channels
+        case SURFACE_FORMAT_A8:
+        case SURFACE_FORMAT_L8:
+        case SURFACE_FORMAT_A8L8:
+        case SURFACE_FORMAT_A8B8G8R8:
+        case SURFACE_FORMAT_X8B8G8R8:
+        case SURFACE_FORMAT_A8R8G8B8:
+        case SURFACE_FORMAT_X8R8G8B8:
+        case SURFACE_FORMAT_NV12:
+        case SURFACE_FORMAT_YV12:
+        default:
+            ocl.image_channel_data_type = CL_UNORM_INT8;
+            break;
+    }
+
+    format.image_channel_order = ocl.image_channel_order;
+    format.image_channel_data_type = ocl.image_channel_data_type;
+
+    return true;
+}
+
+// Stores a surface format's name in `str`; unknown ones log, give "unknown".
+void SurfaceFormatToString(TSurfaceFormat surfaceFormat, std::string &str)
+{
+    const char *name = "unknown";
+    switch (surfaceFormat)
+    {
+        case SURFACE_FORMAT_R32F: name = "R32F"; break;
+        case SURFACE_FORMAT_R16F: name = "R16F"; break;
+        case SURFACE_FORMAT_L16: name = "L16"; break;
+        case SURFACE_FORMAT_A8: name = "A8"; break;
+        case SURFACE_FORMAT_L8: name = "L8"; break;
+        case SURFACE_FORMAT_G32R32F: name = "G32R32F"; break;
+        case SURFACE_FORMAT_G16R16F: name = "G16R16F"; break;
+        case SURFACE_FORMAT_G16R16: name = "G16R16"; break;
+        case SURFACE_FORMAT_A8L8: name = "A8L8"; break;
+        case SURFACE_FORMAT_A32B32G32R32F: name = "A32B32G32R32F"; break;
+        case SURFACE_FORMAT_A16B16G16R16F: name = "A16B16G16R16F"; break;
+        case SURFACE_FORMAT_A16B16G16R16: name = "A16B16G16R16"; break;
+        case SURFACE_FORMAT_A8B8G8R8: name = "A8B8G8R8"; break;
+        case SURFACE_FORMAT_X8B8G8R8: name = "X8B8G8R8"; break;
+        case SURFACE_FORMAT_A8R8G8B8: name = "A8R8G8B8"; break;
+        case SURFACE_FORMAT_X8R8G8B8: name = "X8R8G8B8"; break;
+        case SURFACE_FORMAT_NV12: name = "NV12"; break;
+        case SURFACE_FORMAT_YV12: name = "YV12"; break;
+        default: log_error("SurfaceFormatToString(): unknown surface format!\n");
+    }
+    str = name;
+}
+
+// Creates a width x height surface in `surfaceFormat` on `device` and stores
+// its wrapper in `surface`; `objectSharedHandle` is handed to the API only if
+// `sharedHandle` is set. Returns false off-Windows or when creation fails.
+bool MediaSurfaceCreate(cl_dx9_media_adapter_type_khr adapterType,
+                        unsigned int width, unsigned int height,
+                        TSurfaceFormat surfaceFormat, CDeviceWrapper &device,
+                        std::auto_ptr<CSurfaceWrapper> &surface,
+                        bool sharedHandle, void **objectSharedHandle)
+{
+    switch (adapterType)
+    {
+#if defined(_WIN32)
+        case CL_ADAPTER_D3D9_KHR: {
+            surface =
+                std::auto_ptr<CD3D9SurfaceWrapper>(new CD3D9SurfaceWrapper);
+            CD3D9SurfaceWrapper *d3dSurface =
+                static_cast<CD3D9SurfaceWrapper *>(surface.get());
+            HRESULT hr = 0;
+            D3DFORMAT d3dFormat = SurfaceFormatToD3D(surfaceFormat);
+            LPDIRECT3DDEVICE9 d3d9Device = (LPDIRECT3DDEVICE9)device.Device();
+            hr = d3d9Device->CreateOffscreenPlainSurface(
+                width, height, d3dFormat, D3DPOOL_DEFAULT, &(*d3dSurface),
+                sharedHandle ? objectSharedHandle : 0);
+            if (FAILED(hr))
+            {
+                log_error("CreateOffscreenPlainSurface failed\n");
+                return false;
+            }
+        }
+        break;
+        case CL_ADAPTER_D3D9EX_KHR: {
+            surface =
+                std::auto_ptr<CD3D9SurfaceWrapper>(new CD3D9SurfaceWrapper);
+            CD3D9SurfaceWrapper *d3dSurface =
+                static_cast<CD3D9SurfaceWrapper *>(surface.get());
+            HRESULT hr = 0;
+            D3DFORMAT d3dFormat = SurfaceFormatToD3D(surfaceFormat);
+            LPDIRECT3DDEVICE9EX d3d9ExDevice =
+                (LPDIRECT3DDEVICE9EX)device.Device();
+            hr = d3d9ExDevice->CreateOffscreenPlainSurface(
+                width, height, d3dFormat, D3DPOOL_DEFAULT, &(*d3dSurface),
+                sharedHandle ? objectSharedHandle : 0);
+            if (FAILED(hr))
+            {
+                log_error("CreateOffscreenPlainSurface failed\n");
+                return false;
+            }
+        }
+        break;
+        case CL_ADAPTER_DXVA_KHR: {
+            surface =
+                std::auto_ptr<CD3D9SurfaceWrapper>(new CD3D9SurfaceWrapper);
+            CD3D9SurfaceWrapper *d3dSurface =
+                static_cast<CD3D9SurfaceWrapper *>(surface.get());
+            HRESULT hr = 0;
+            D3DFORMAT d3dFormat = SurfaceFormatToD3D(surfaceFormat);
+            IDXVAHD_Device *dxvaDevice = (IDXVAHD_Device *)device.Device();
+            hr = dxvaDevice->CreateVideoSurface(
+                width, height, d3dFormat, D3DPOOL_DEFAULT, 0,
+                DXVAHD_SURFACE_TYPE_VIDEO_INPUT, 1, &(*d3dSurface),
+                sharedHandle ? objectSharedHandle : 0);
+            if (FAILED(hr))
+            {
+                log_error("CreateVideoSurface failed\n");
+                return false;
+            }
+        }
+        break;
+#endif
+        default:
+            log_error("MediaSurfaceCreate(): Unknown adapter type!\n");
+            return false;
+            break;
+    }
+
+    return true;
+}
+
+// Probes for OpenCL devices behind a DX9 media adapter. CL_DEVICE_NOT_FOUND
+// is recorded in `result` as not supported, any other failure as an error;
+// the raw clGetDeviceIDsFromDX9MediaAdapterKHR status is returned.
+cl_int deviceExistForCLTest(
+    cl_platform_id platform, cl_dx9_media_adapter_type_khr media_adapters_type,
+    void *media_adapters, CResult &result,
+    TSharedHandleType sharedHandle /*default SHARED_HANDLE_ENABLED*/
+)
+{
+    std::string adapterName;
+    AdapterToString(media_adapters_type, adapterName);
+
+    cl_uint deviceCount = 0;
+    const cl_int status = clGetDeviceIDsFromDX9MediaAdapterKHR(
+        platform, 1, &media_adapters_type, &media_adapters,
+        CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, 0, 0, &deviceCount);
+    if (status == CL_SUCCESS) return status;
+
+    // Finding no matching device is reported as "not supported" rather
+    // than as a test error.
+    if (status == CL_DEVICE_NOT_FOUND)
+    {
+        log_info("Skipping test case, device type is not supported by a "
+                 "device (adapter type: %s, shared handle: %s)\n",
+                 adapterName.c_str(),
+                 (sharedHandle == SHARED_HANDLE_ENABLED) ? "yes" : "no");
+        result.ResultSub(CResult::TEST_NOTSUPPORTED);
+        return status;
+    }
+
+    // Every other status is a failure of the query itself.
+    log_error("clGetDeviceIDsFromDX9MediaAdapterKHR failed: %s\n",
+              IGetErrorString(status));
+    result.ResultSub(CResult::TEST_ERROR);
+    return status;
+}
diff --git a/test_conformance/extensions/cl_khr_dx9_media_sharing/utils.h b/test_conformance/extensions/cl_khr_dx9_media_sharing/utils.h
new file mode 100644
index 0000000000..56c0fc2c4d
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_dx9_media_sharing/utils.h
@@ -0,0 +1,215 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef __UTILS_KHR_MEDIA_H
+#define __UTILS_KHR_MEDIA_H
+
+#include <string>
+#include <iostream>
+#include <memory>
+#include <vector>
+#include "wrappers.h"
+#include "CL/cl_dx9_media_sharing.h"
+
+#include "harness/typeWrappers.h"
+
+
+extern clGetDeviceIDsFromDX9MediaAdapterKHR_fn
+    clGetDeviceIDsFromDX9MediaAdapterKHR;
+extern clCreateFromDX9MediaSurfaceKHR_fn clCreateFromDX9MediaSurfaceKHR;
+extern clEnqueueAcquireDX9MediaSurfacesKHR_fn
+    clEnqueueAcquireDX9MediaSurfacesKHR;
+extern clEnqueueReleaseDX9MediaSurfacesKHR_fn
+    clEnqueueReleaseDX9MediaSurfacesKHR;
+
+extern cl_platform_id gPlatformIDdetected;
+extern cl_device_id gDeviceIDdetected;
+extern cl_device_type gDeviceTypeSelected;
+
+#define NL "\n"
+#define TEST_NOT_IMPLEMENTED -1
+#define TEST_NOT_SUPPORTED -2
+
+enum TSurfaceFormat // surface formats understood by the test utilities
+{
+    SURFACE_FORMAT_NV12, // FOURCC 'NV12'; PlanesNum() reports 2 planes
+    SURFACE_FORMAT_YV12, // FOURCC 'YV12'; PlanesNum() reports 3 planes
+    SURFACE_FORMAT_R32F, // this and all following: single plane, D3DFMT_*
+    SURFACE_FORMAT_R16F,
+    SURFACE_FORMAT_L16,
+    SURFACE_FORMAT_A8,
+    SURFACE_FORMAT_L8,
+    SURFACE_FORMAT_G32R32F,
+    SURFACE_FORMAT_G16R16F,
+    SURFACE_FORMAT_G16R16,
+    SURFACE_FORMAT_A8L8,
+    SURFACE_FORMAT_A32B32G32R32F,
+    SURFACE_FORMAT_A16B16G16R16F,
+    SURFACE_FORMAT_A16B16G16R16,
+    SURFACE_FORMAT_A8B8G8R8,
+    SURFACE_FORMAT_X8B8G8R8,
+    SURFACE_FORMAT_A8R8G8B8,
+    SURFACE_FORMAT_X8R8G8B8,
+};
+
+enum TContextFuncType // NOTE(review): presumably picks the context API used
+{
+    CONTEXT_CREATE_DEFAULT, // presumably clCreateContext - TODO confirm
+    CONTEXT_CREATE_FROM_TYPE, // presumably clCreateContextFromType - confirm
+};
+
+enum TSharedHandleType // logged as "shared handle: yes/no" by the tests
+{
+    SHARED_HANDLE_ENABLED, // reported as "yes"
+    SHARED_HANDLE_DISABLED, // reported as "no"
+};
+
+class CResult { // collects sub-test outcomes (implementation not shown here)
+public:
+    enum TTestResult
+    {
+        TEST_NORESULT,
+        TEST_NOTSUPPORTED,
+        TEST_PASS,
+        TEST_FAIL,
+        TEST_ERROR,
+    };
+
+    CResult();
+    ~CResult();
+
+    void ResultSub(TTestResult result); // records the outcome of one sub-test
+    TTestResult ResultLast() const; // presumably the latest ResultSub() value
+    int Result() const; // NOTE(review): presumably overall status - confirm
+
+private:
+    TTestResult _result;
+    TTestResult _resultLast;
+};
+
+void FunctionContextCreateToString(TContextFuncType contextCreateFunction,
+                                   std::string &contextFunction);
+void AdapterToString(cl_dx9_media_adapter_type_khr adapterType,
+                     std::string &adapter);
+cl_context_info
+AdapterTypeToContextInfo(cl_dx9_media_adapter_type_khr adapterType);
+
+// YUV utils
+void YUVGenerateNV12(std::vector<cl_uchar> &yuv, unsigned int width,
+                     unsigned int height, cl_uchar valueMin, cl_uchar valueMax,
+                     double valueAdd = 0.0);
+void YUVGenerateYV12(std::vector<cl_uchar> &yuv, unsigned int width,
+                     unsigned int height, cl_uchar valueMin, cl_uchar valueMax,
+                     double valueAdd = 0.0);
+bool YUVGenerate(TSurfaceFormat surfaceFormat, std::vector<cl_uchar> &yuv,
+                 unsigned int width, unsigned int height, cl_uchar valueMin,
+                 cl_uchar valueMax, double valueAdd = 0.0);
+bool YUVSurfaceSetNV12(std::auto_ptr<CSurfaceWrapper> &surface,
+                       const std::vector<cl_uchar> &yuv, unsigned int width,
+                       unsigned int height);
+bool YUVSurfaceSetYV12(std::auto_ptr<CSurfaceWrapper> &surface,
+                       const std::vector<cl_uchar> &yuv, unsigned int width,
+                       unsigned int height);
+bool YUVSurfaceSet(TSurfaceFormat surfaceFormat,
+                   std::auto_ptr<CSurfaceWrapper> &surface,
+                   const std::vector<cl_uchar> &yuv, unsigned int width,
+                   unsigned int height);
+bool YUVSurfaceGetNV12(std::auto_ptr<CSurfaceWrapper> &surface,
+                       std::vector<cl_uchar> &yuv, unsigned int width,
+                       unsigned int height);
+bool YUVSurfaceGetYV12(std::auto_ptr<CSurfaceWrapper> &surface,
+                       std::vector<cl_uchar> &yuv, unsigned int width,
+                       unsigned int height);
+bool YUVSurfaceGet(TSurfaceFormat surfaceFormat,
+                   std::auto_ptr<CSurfaceWrapper> &surface,
+                   std::vector<cl_uchar> &yuv, unsigned int width,
+                   unsigned int height);
+bool YUVCompareNV12(const std::vector<cl_uchar> &yuvTest,
+                    const std::vector<cl_uchar> &yuvRef, unsigned int width,
+                    unsigned int height);
+bool YUVCompareYV12(const std::vector<cl_uchar> &yuvTest,
+                    const std::vector<cl_uchar> &yuvRef, unsigned int width,
+                    unsigned int height);
+bool YUVCompare(TSurfaceFormat surfaceFormat,
+                const std::vector<cl_uchar> &yuvTest,
+                const std::vector<cl_uchar> &yuvRef, unsigned int width,
+                unsigned int height);
+
+// other types utils
+void DataGenerate(TSurfaceFormat surfaceFormat, cl_channel_type type,
+                  std::vector<float> &data, unsigned int width,
+                  unsigned int height, unsigned int channelNum,
+                  float cmin = 0.0f, float cmax = 1.0f, float add = 0.0f);
+void DataGenerate(TSurfaceFormat surfaceFormat, cl_channel_type type,
+                  std::vector<cl_half> &data, unsigned int width,
+                  unsigned int height, unsigned int channelNum,
+                  float cmin = 0.0f, float cmax = 1.0f, float add = 0.0f);
+void DataGenerate(TSurfaceFormat surfaceFormat, cl_channel_type type,
+                  std::vector<cl_uchar> &data, unsigned int width,
+                  unsigned int height, unsigned int channelNum,
+                  float cmin = 0.0f, float cmax = 1.0f, float add = 0.0f);
+bool DataCompare(TSurfaceFormat surfaceFormat, cl_channel_type type,
+                 const std::vector<cl_float> &dataTest,
+                 const std::vector<cl_float> &dataExp, unsigned int width,
+                 unsigned int height, unsigned int channelNum);
+bool DataCompare(TSurfaceFormat surfaceFormat, cl_channel_type type,
+                 const std::vector<cl_half> &dataTest,
+                 const std::vector<cl_half> &dataExp, unsigned int width,
+                 unsigned int height, unsigned int channelNum);
+bool DataCompare(TSurfaceFormat surfaceFormat, cl_channel_type type,
+                 const std::vector<cl_uchar> &dataTest,
+                 const std::vector<cl_uchar> &dataExp, unsigned int width,
+                 unsigned int height, unsigned int channelNum);
+
+bool GetImageInfo(cl_mem object, cl_image_format formatExp,
+                  size_t elementSizeExp, size_t rowPitchExp,
+                  size_t slicePitchExp, size_t widthExp, size_t heightExp,
+                  size_t depthExp, unsigned int planeExp);
+bool GetMemObjInfo(cl_mem object, cl_dx9_media_adapter_type_khr adapterType,
+                   std::auto_ptr<CSurfaceWrapper> &surface,
+                   void *shareHandleExp);
+bool ImageInfoVerify(cl_dx9_media_adapter_type_khr adapterType,
+                     const std::vector<cl_mem> &memObjList, unsigned int width,
+                     unsigned int height,
+                     std::auto_ptr<CSurfaceWrapper> &surface,
+                     void *sharedHandle);
+bool ImageFormatCheck(cl_context context, cl_mem_object_type imageType,
+                      const cl_image_format imageFormatCheck);
+unsigned int ChannelNum(TSurfaceFormat surfaceFormat);
+unsigned int PlanesNum(TSurfaceFormat surfaceFormat);
+
+#if defined(_WIN32)
+D3DFORMAT SurfaceFormatToD3D(TSurfaceFormat surfaceFormat);
+#endif
+
+bool DeviceCreate(cl_dx9_media_adapter_type_khr adapterType,
+                  std::auto_ptr<CDeviceWrapper> &device);
+bool SurfaceFormatCheck(cl_dx9_media_adapter_type_khr adapterType,
+                        const CDeviceWrapper &device,
+                        TSurfaceFormat surfaceFormat);
+bool SurfaceFormatToOCL(TSurfaceFormat surfaceFormat, cl_image_format &format);
+void SurfaceFormatToString(TSurfaceFormat surfaceFormat, std::string &str);
+bool MediaSurfaceCreate(cl_dx9_media_adapter_type_khr adapterType,
+                        unsigned int width, unsigned int height,
+                        TSurfaceFormat surfaceFormat, CDeviceWrapper &device,
+                        std::auto_ptr<CSurfaceWrapper> &surface,
+                        bool sharedHandle, void **objectSharedHandle);
+
+cl_int
+deviceExistForCLTest(cl_platform_id platform,
+                     cl_dx9_media_adapter_type_khr media_adapters_type,
+                     void *media_adapters, CResult &result,
+                     TSharedHandleType sharedHandle = SHARED_HANDLE_DISABLED);
+#endif // __UTILS_KHR_MEDIA_H
diff --git a/test_conformance/extensions/cl_khr_dx9_media_sharing/wrappers.cpp b/test_conformance/extensions/cl_khr_dx9_media_sharing/wrappers.cpp
new file mode 100644
index 0000000000..e156584e72
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_dx9_media_sharing/wrappers.cpp
@@ -0,0 +1,463 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "wrappers.h"
+#include "harness/errorHelpers.h"
+
+LPCTSTR CDeviceWrapper::WINDOW_TITLE = _T( "cl_khr_dx9_media_sharing" );
+const int CDeviceWrapper::WINDOW_WIDTH = 256;
+const int CDeviceWrapper::WINDOW_HEIGHT = 256;
+CDeviceWrapper::TAccelerationType CDeviceWrapper::accelerationType =
+    CDeviceWrapper::ACCELERATION_HW;
+
+#if defined(_WIN32)
+const D3DFORMAT CDXVAWrapper::RENDER_TARGET_FORMAT = D3DFMT_X8R8G8B8;
+const D3DFORMAT CDXVAWrapper::VIDEO_FORMAT = D3DFMT_X8R8G8B8;
+const unsigned int CDXVAWrapper::VIDEO_FPS = 60;
+#endif
+
+#if defined(_WIN32)
+static LRESULT WINAPI WndProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam)
+{
+    switch (msg)
+    {
+        case WM_DESTROY: PostQuitMessage(0); return 0;
+        case WM_PAINT: ValidateRect(hWnd, 0); return 0;
+        default: break;
+    }
+
+    return DefWindowProc(hWnd, msg, wParam, lParam);
+}
+#endif
+
+CDeviceWrapper::CDeviceWrapper()
+#if defined(_WIN32)
+    : _hInstance(NULL), _hWnd(NULL)
+#endif
+{}
+
+void CDeviceWrapper::WindowInit()
+{
+    // Registers the window class and creates the window; no-op off Windows.
+#if defined(_WIN32)
+    _hInstance = GetModuleHandle(NULL);
+    static WNDCLASSEX wc = {
+        sizeof(WNDCLASSEX), CS_CLASSDC, WndProc, 0L,   0L,
+        _hInstance,         NULL,       NULL,    NULL, NULL,
+        WINDOW_TITLE,       NULL
+    };
+    // wc is a function-local static; RegisterClassEx's result is not checked.
+    RegisterClassEx(&wc);
+
+    _hWnd = CreateWindow(WINDOW_TITLE, WINDOW_TITLE, WS_OVERLAPPEDWINDOW, 0, 0,
+                         WINDOW_WIDTH, WINDOW_HEIGHT, NULL, NULL, wc.hInstance,
+                         NULL);
+    if (!_hWnd)
+    {
+        log_error("Failed to create window\n");
+        return;
+    }
+
+    ShowWindow(_hWnd, SW_SHOWDEFAULT);
+    UpdateWindow(_hWnd);
+#endif
+}
+
+void CDeviceWrapper::WindowDestroy()
+{
+#if defined(_WIN32)
+    if (_hWnd) DestroyWindow(_hWnd);
+    _hWnd = NULL;
+#endif
+}
+
+#if defined(_WIN32)
+HWND CDeviceWrapper::WindowHandle() const { return _hWnd; }
+#endif
+
+int CDeviceWrapper::WindowWidth() const { return WINDOW_WIDTH; }
+
+int CDeviceWrapper::WindowHeight() const { return WINDOW_HEIGHT; }
+
+CDeviceWrapper::TAccelerationType CDeviceWrapper::AccelerationType()
+{
+    return accelerationType;
+}
+
+void CDeviceWrapper::AccelerationType(TAccelerationType accelerationTypeNew)
+{
+    accelerationType = accelerationTypeNew;
+}
+
+CDeviceWrapper::~CDeviceWrapper() { WindowDestroy(); }
+
+#if defined(_WIN32)
+CD3D9Wrapper::CD3D9Wrapper()
+    : _d3d9(NULL), _d3dDevice(NULL), _status(DEVICE_PASS), _adapterIdx(0),
+      _adapterFound(false)
+{
+    WindowInit();
+
+    _d3d9 = Direct3DCreate9(D3D_SDK_VERSION);
+    if (!_d3d9)
+    {
+        log_error("Direct3DCreate9 failed\n");
+        _status = DEVICE_FAIL;
+    }
+}
+
+CD3D9Wrapper::~CD3D9Wrapper()
+{
+    Destroy();
+
+    if (_d3d9) _d3d9->Release();
+    _d3d9 = 0;
+}
+
+void CD3D9Wrapper::Destroy()
+{
+    if (_d3dDevice) _d3dDevice->Release();
+    _d3dDevice = 0;
+}
+
+cl_int CD3D9Wrapper::Init()
+{
+    // Creates the D3D9 device for the current adapter. Returns non-zero on
+    // success and 0 on every failure, as AdapterNext() tests "!Init()".
+    if (!WindowHandle())
+    {
+        log_error("D3D9: Window is not created\n");
+        _status = DEVICE_FAIL;
+        return false;
+    }
+    if (!_d3d9 || DEVICE_PASS != _status || !_adapterFound) return false;
+
+    // _adapterIdx is one past the current adapter, hence the "- 1".
+    _d3d9->GetAdapterDisplayMode(_adapterIdx - 1, &_d3ddm);
+
+    D3DPRESENT_PARAMETERS d3dParams;
+    ZeroMemory(&d3dParams, sizeof(d3dParams));
+    d3dParams.Windowed = TRUE;
+    d3dParams.BackBufferCount = 1;
+    d3dParams.SwapEffect = D3DSWAPEFFECT_DISCARD;
+    d3dParams.hDeviceWindow = WindowHandle();
+    d3dParams.BackBufferWidth = WindowWidth();
+    d3dParams.BackBufferHeight = WindowHeight();
+    d3dParams.BackBufferFormat = _d3ddm.Format;
+
+    DWORD processingType = (AccelerationType() == ACCELERATION_HW)
+        ? D3DCREATE_HARDWARE_VERTEXPROCESSING
+        : D3DCREATE_SOFTWARE_VERTEXPROCESSING;
+
+    if (FAILED(_d3d9->CreateDevice(_adapterIdx - 1, D3DDEVTYPE_HAL,
+                                   WindowHandle(), processingType, &d3dParams,
+                                   &_d3dDevice)))
+    {
+        log_error("CreateDevice failed\n");
+        _status = DEVICE_FAIL;
+        return false;
+    }
+
+    _d3dDevice->BeginScene();
+    _d3dDevice->Clear(0, NULL, D3DCLEAR_TARGET, 0, 1.0f, 0);
+    _d3dDevice->EndScene();
+    return true;
+}
+
+void *CD3D9Wrapper::D3D() const { return _d3d9; }
+
+void *CD3D9Wrapper::Device() const { return _d3dDevice; }
+
+D3DFORMAT CD3D9Wrapper::Format() { return _d3ddm.Format; }
+
+D3DADAPTER_IDENTIFIER9 CD3D9Wrapper::Adapter() { return _adapter; }
+
+TDeviceStatus CD3D9Wrapper::Status() const { return _status; }
+
+bool CD3D9Wrapper::AdapterNext()
+{
+    // Moves to the next adapter that reports HAL device caps and recreates
+    // the device on it. Returns true when Init() returned non-zero for it.
+    if (DEVICE_PASS != _status) return false;
+
+    _adapterFound = false;
+    for (; _adapterIdx < _d3d9->GetAdapterCount();)
+    {
+        ++_adapterIdx;
+        D3DCAPS9 caps;
+        if (FAILED(
+                _d3d9->GetDeviceCaps(_adapterIdx - 1, D3DDEVTYPE_HAL, &caps)))
+            continue;
+
+        if (FAILED(_d3d9->GetAdapterIdentifier(_adapterIdx - 1, 0, &_adapter)))
+        {
+            log_error("D3D9: GetAdapterIdentifier failed\n");
+            _status = DEVICE_FAIL;
+            return false;
+        }
+
+        _adapterFound = true;
+        Destroy();
+        if (!Init())
+        {
+            _status = DEVICE_FAIL;
+            _adapterFound = false;
+        }
+        break;
+    }
+    return _adapterFound;
+}
+
+unsigned int CD3D9Wrapper::AdapterIdx() const { return _adapterIdx - 1; }
+
+
+CD3D9ExWrapper::CD3D9ExWrapper()
+    : _d3d9Ex(NULL), _d3dDeviceEx(NULL), _status(DEVICE_PASS), _adapterIdx(0),
+      _adapterFound(false)
+{
+    WindowInit();
+
+    HRESULT result = Direct3DCreate9Ex(D3D_SDK_VERSION, &_d3d9Ex);
+    if (FAILED(result) || !_d3d9Ex)
+    {
+        log_error("Direct3DCreate9Ex failed\n");
+        _status = DEVICE_FAIL;
+    }
+}
+
+CD3D9ExWrapper::~CD3D9ExWrapper()
+{
+    Destroy();
+
+    if (_d3d9Ex) _d3d9Ex->Release();
+    _d3d9Ex = 0;
+}
+
+void *CD3D9ExWrapper::D3D() const { return _d3d9Ex; }
+
+void *CD3D9ExWrapper::Device() const { return _d3dDeviceEx; }
+
+D3DFORMAT CD3D9ExWrapper::Format() { return _d3ddmEx.Format; }
+
+D3DADAPTER_IDENTIFIER9 CD3D9ExWrapper::Adapter() { return _adapter; }
+
+cl_int CD3D9ExWrapper::Init()
+{
+    // Creates the D3D9Ex device for the adapter selected by AdapterNext().
+    // Returns DEVICE_PASS or DEVICE_FAIL; both are non-zero, so callers must
+    // compare the result against DEVICE_PASS instead of testing for zero.
+    if (!WindowHandle())
+    {
+        log_error("D3D9EX: Window is not created\n");
+        _status = DEVICE_FAIL;
+        return DEVICE_FAIL;
+    }
+
+    if (!_d3d9Ex || DEVICE_FAIL == _status || !_adapterFound)
+        return DEVICE_FAIL;
+
+    D3DPRESENT_PARAMETERS d3dParams;
+    ZeroMemory(&d3dParams, sizeof(d3dParams));
+
+    d3dParams.Windowed = TRUE;
+    d3dParams.SwapEffect = D3DSWAPEFFECT_FLIP;
+    d3dParams.BackBufferFormat = D3DFMT_X8R8G8B8;
+    d3dParams.BackBufferWidth = WindowWidth();
+    d3dParams.BackBufferHeight = WindowHeight();
+
+    d3dParams.BackBufferCount = 1;
+    d3dParams.hDeviceWindow = WindowHandle();
+
+    DWORD processingType = (AccelerationType() == ACCELERATION_HW)
+        ? D3DCREATE_HARDWARE_VERTEXPROCESSING
+        : D3DCREATE_SOFTWARE_VERTEXPROCESSING;
+
+    if (FAILED(_d3d9Ex->CreateDeviceEx(_adapterIdx - 1, D3DDEVTYPE_HAL,
+                                       WindowHandle(), processingType,
+                                       &d3dParams, NULL, &_d3dDeviceEx)))
+    {
+        log_error("CreateDeviceEx failed\n");
+        _status = DEVICE_FAIL;
+        return DEVICE_FAIL;
+    }
+
+    _d3dDeviceEx->BeginScene();
+    _d3dDeviceEx->Clear(0, NULL, D3DCLEAR_TARGET, 0, 1.0f, 0);
+    _d3dDeviceEx->EndScene();
+
+    return DEVICE_PASS;
+}
+
+void CD3D9ExWrapper::Destroy()
+{
+    if (_d3dDeviceEx) _d3dDeviceEx->Release();
+    _d3dDeviceEx = 0;
+}
+
+TDeviceStatus CD3D9ExWrapper::Status() const { return _status; }
+
+bool CD3D9ExWrapper::AdapterNext()
+{
+    // Moves to the next adapter that reports HAL device caps and recreates
+    // the device on it. Returns true only when that device was created.
+    if (DEVICE_FAIL == _status) return false;
+    _adapterFound = false;
+    for (; _adapterIdx < _d3d9Ex->GetAdapterCount();)
+    {
+        ++_adapterIdx;
+        D3DCAPS9 caps;
+        if (FAILED(
+                _d3d9Ex->GetDeviceCaps(_adapterIdx - 1, D3DDEVTYPE_HAL, &caps)))
+            continue;
+
+        if (FAILED(
+                _d3d9Ex->GetAdapterIdentifier(_adapterIdx - 1, 0, &_adapter)))
+        {
+            log_error("D3D9EX: GetAdapterIdentifier failed\n");
+            _status = DEVICE_FAIL;
+            return false;
+        }
+
+        _adapterFound = true;
+        Destroy();
+        // Init() returns a TDeviceStatus, and DEVICE_FAIL is non-zero.
+        if (DEVICE_PASS != Init())
+        {
+            _status = DEVICE_FAIL;
+            _adapterFound = false;
+        }
+        break;
+    }
+    return _adapterFound;
+}
+
+unsigned int CD3D9ExWrapper::AdapterIdx() const { return _adapterIdx - 1; }
+
+CDXVAWrapper::CDXVAWrapper()
+    : _dxvaDevice(NULL), _status(DEVICE_PASS), _adapterFound(false)
+{
+    _status = _d3d9.Status();
+}
+
+CDXVAWrapper::~CDXVAWrapper() { DXVAHDDestroy(); }
+
+void *CDXVAWrapper::Device() const { return _dxvaDevice; }
+
+TDeviceStatus CDXVAWrapper::Status() const
+{
+    if (_status == DEVICE_FAIL || _d3d9.Status() == DEVICE_FAIL)
+        return DEVICE_FAIL;
+    else if (_status == DEVICE_NOTSUPPORTED
+             || _d3d9.Status() == DEVICE_NOTSUPPORTED)
+        return DEVICE_NOTSUPPORTED;
+    else
+        return DEVICE_PASS;
+}
+
+bool CDXVAWrapper::AdapterNext()
+{
+    // Advances _d3d9 to its next adapter, then recreates the DXVA-HD device.
+    if (DEVICE_PASS != _status) return false;
+
+    _adapterFound = _d3d9.AdapterNext();
+    _status = _d3d9.Status();
+    if (DEVICE_PASS != _status)
+    {
+        _adapterFound = false;
+        return false;
+    }
+
+    if (!_adapterFound) return false;
+
+    DXVAHDDestroy();
+    _status = DXVAHDInit();
+    if (DEVICE_PASS != _status)
+    {
+        _adapterFound = false;
+        return false;
+    }
+    return true;
+}
+
+TDeviceStatus CDXVAWrapper::DXVAHDInit()
+{
+    if ((_status == DEVICE_FAIL) || (_d3d9.Status() == DEVICE_FAIL)
+        || !_adapterFound)
+        return DEVICE_FAIL;
+
+    DXVAHD_RATIONAL fps = { VIDEO_FPS, 1 };
+
+    DXVAHD_CONTENT_DESC desc;
+    desc.InputFrameFormat = DXVAHD_FRAME_FORMAT_PROGRESSIVE;
+    desc.InputFrameRate = fps;
+    desc.InputWidth = WindowWidth();
+    desc.InputHeight = WindowHeight();
+    desc.OutputFrameRate = fps;
+    desc.OutputWidth = WindowWidth();
+    desc.OutputHeight = WindowHeight();
+
+#ifdef USE_SOFTWARE_PLUGIN
+    _status = DEVICE_FAIL;
+    return DEVICE_FAIL;
+#endif
+
+    HRESULT hr = DXVAHD_CreateDevice(
+        static_cast<IDirect3DDevice9Ex *>(_d3d9.Device()), &desc,
+        DXVAHD_DEVICE_USAGE_PLAYBACK_NORMAL, NULL, &_dxvaDevice);
+    if (FAILED(hr))
+    {
+        if (hr == E_NOINTERFACE)
+        {
+            log_error(
+                "DXVAHD_CreateDevice skipped due to no supported devices!\n");
+            _status = DEVICE_NOTSUPPORTED;
+        }
+        else
+        {
+            log_error("DXVAHD_CreateDevice failed\n");
+            _status = DEVICE_FAIL;
+        }
+    }
+
+    return _status;
+}
+
+void CDXVAWrapper::DXVAHDDestroy()
+{
+    if (_dxvaDevice) _dxvaDevice->Release();
+    _dxvaDevice = 0;
+}
+
+void *CDXVAWrapper::D3D() const { return _d3d9.D3D(); }
+
+unsigned int CDXVAWrapper::AdapterIdx() const { return _d3d9.AdapterIdx(); }
+
+const CD3D9ExWrapper &CDXVAWrapper::D3D9() const { return _d3d9; }
+
+CD3D9SurfaceWrapper::CD3D9SurfaceWrapper(): mMem(NULL) {}
+
+CD3D9SurfaceWrapper::CD3D9SurfaceWrapper(IDirect3DSurface9 *mem): mMem(mem) {}
+
+CD3D9SurfaceWrapper::~CD3D9SurfaceWrapper()
+{
+    if (mMem != NULL) mMem->Release();
+    mMem = NULL;
+}
+
+#endif
+
+CSurfaceWrapper::CSurfaceWrapper() {}
+
+CSurfaceWrapper::~CSurfaceWrapper() {}
diff --git a/test_conformance/extensions/cl_khr_dx9_media_sharing/wrappers.h b/test_conformance/extensions/cl_khr_dx9_media_sharing/wrappers.h
new file mode 100644
index 0000000000..e3a7c6d818
--- /dev/null
+++ b/test_conformance/extensions/cl_khr_dx9_media_sharing/wrappers.h
@@ -0,0 +1,195 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef __WRAPPERS_H
+#define __WRAPPERS_H
+
+#if defined(_WIN32)
+#include <d3d9.h>
+#if defined(__MINGW32__)
+#include <rpcsal.h>
+typedef unsigned char UINT8;
+#define __out
+#define __in
+#define __inout
+#define __out_bcount(size)
+#define __out_bcount_opt(size)
+#define __in_opt
+#define __in_ecount(size)
+#define __in_ecount_opt(size)
+#define __out_opt
+#define __out_ecount(size)
+#define __out_ecount_opt(size)
+#define __in_bcount_opt(size)
+#define __inout_opt
+#define __inout_bcount(size)
+#define __in_bcount(size)
+#define __deref_out
+#endif
+#include <dxvahd.h>
+#include <tchar.h>
+#endif
+
+enum TDeviceStatus
+{
+    DEVICE_NOTSUPPORTED, // 0: e.g. DXVAHD_CreateDevice returned E_NOINTERFACE
+    DEVICE_PASS, // 1
+    DEVICE_FAIL, // 2: non-zero, so a "!status" test cannot detect it
+};
+
+class CDeviceWrapper {
+public:
+    enum TAccelerationType
+    {
+        ACCELERATION_HW,
+        ACCELERATION_SW,
+    };
+
+    CDeviceWrapper();
+    virtual ~CDeviceWrapper();
+
+    virtual bool AdapterNext() = 0;
+    virtual unsigned int AdapterIdx() const = 0;
+    virtual void *Device() const = 0;
+    virtual TDeviceStatus Status() const = 0;
+    virtual void *D3D() const = 0;
+
+#if defined(_WIN32)
+    HWND WindowHandle() const;
+#endif
+    int WindowWidth() const;
+    int WindowHeight() const;
+    void WindowInit();
+
+
+    static TAccelerationType AccelerationType();
+    static void AccelerationType(TAccelerationType accelerationTypeNew);
+
+private:
+    static LPCTSTR WINDOW_TITLE;
+    static const int WINDOW_WIDTH;
+    static const int WINDOW_HEIGHT;
+    static TAccelerationType accelerationType;
+
+#if defined(_WIN32)
+    HMODULE _hInstance;
+    HWND _hWnd;
+#endif
+
+    void WindowDestroy();
+};
+
+class CSurfaceWrapper {
+public:
+    CSurfaceWrapper();
+    virtual ~CSurfaceWrapper();
+};
+
+#if defined(_WIN32)
+// windows specific wrappers
+class CD3D9Wrapper : public CDeviceWrapper {
+public:
+    CD3D9Wrapper();
+    ~CD3D9Wrapper();
+
+    virtual bool AdapterNext();
+    virtual unsigned int AdapterIdx() const;
+    virtual void *Device() const;
+    virtual TDeviceStatus Status() const;
+    virtual void *D3D() const;
+
+private:
+    LPDIRECT3D9 _d3d9;
+    LPDIRECT3DDEVICE9 _d3dDevice;
+    D3DDISPLAYMODE _d3ddm;
+    D3DADAPTER_IDENTIFIER9 _adapter;
+    TDeviceStatus _status;
+    unsigned int _adapterIdx;
+    bool _adapterFound;
+
+    D3DFORMAT Format();
+    D3DADAPTER_IDENTIFIER9 Adapter();
+    int Init();
+    void Destroy();
+};
+
+class CD3D9ExWrapper : public CDeviceWrapper {
+public:
+    CD3D9ExWrapper();
+    ~CD3D9ExWrapper();
+
+    virtual bool AdapterNext();
+    virtual unsigned int AdapterIdx() const;
+    virtual void *Device() const;
+    virtual TDeviceStatus Status() const;
+    virtual void *D3D() const;
+
+private:
+    LPDIRECT3D9EX _d3d9Ex;
+    LPDIRECT3DDEVICE9EX _d3dDeviceEx;
+    D3DDISPLAYMODEEX _d3ddmEx;
+    D3DADAPTER_IDENTIFIER9 _adapter;
+    TDeviceStatus _status;
+    unsigned int _adapterIdx;
+    bool _adapterFound;
+
+    D3DFORMAT Format();
+    D3DADAPTER_IDENTIFIER9 Adapter();
+    int Init();
+    void Destroy();
+};
+
+class CDXVAWrapper : public CDeviceWrapper {
+public:
+    CDXVAWrapper();
+    ~CDXVAWrapper();
+
+    virtual bool AdapterNext();
+    virtual unsigned int AdapterIdx() const;
+    virtual void *Device() const;
+    virtual TDeviceStatus Status() const;
+    virtual void *D3D() const;
+    const CD3D9ExWrapper &D3D9() const;
+
+private:
+    CD3D9ExWrapper _d3d9;
+    IDXVAHD_Device *_dxvaDevice;
+    TDeviceStatus _status;
+    bool _adapterFound;
+
+    static const D3DFORMAT RENDER_TARGET_FORMAT;
+    static const D3DFORMAT VIDEO_FORMAT;
+    static const unsigned int VIDEO_FPS;
+
+    TDeviceStatus DXVAHDInit();
+    void DXVAHDDestroy();
+};
+
+class CD3D9SurfaceWrapper : public CSurfaceWrapper {
+public:
+    CD3D9SurfaceWrapper();
+    CD3D9SurfaceWrapper(IDirect3DSurface9 *mem);
+    ~CD3D9SurfaceWrapper();
+
+    operator IDirect3DSurface9 *() { return mMem; }
+    IDirect3DSurface9 **operator&() { return &mMem; }
+    IDirect3DSurface9 *operator->() const { return mMem; }
+
+private:
+    IDirect3DSurface9 *mMem;
+};
+#endif
+
+#endif // __WRAPPERS_H
diff --git a/test_extensions/CMakeLists.txt b/test_extensions/CMakeLists.txt
deleted file mode 100644
index 3c48e18699..0000000000
--- a/test_extensions/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-set(HARNESS_LIB harness)
-add_subdirectory( media_sharing )
diff --git a/test_extensions/media_sharing/main.cpp b/test_extensions/media_sharing/main.cpp
deleted file mode 100644
index f0c3aff8ac..0000000000
--- a/test_extensions/media_sharing/main.cpp
+++ /dev/null
@@ -1,204 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "harness/testHarness.h"
-#include "utils.h"
-#include "procs.h"
-
-
-test_definition test_list[] = {
-ADD_TEST( context_create ),
-ADD_TEST( get_device_ids ),
-ADD_TEST( api ),
-ADD_TEST( kernel ),
-ADD_TEST( other_data_types ),
-ADD_TEST( memory_access ),
-ADD_TEST( interop_user_sync )
-};
-
-const int test_num = ARRAY_SIZE(test_list);
-
-clGetDeviceIDsFromDX9MediaAdapterKHR_fn clGetDeviceIDsFromDX9MediaAdapterKHR = NULL;
-clCreateFromDX9MediaSurfaceKHR_fn clCreateFromDX9MediaSurfaceKHR = NULL;
-clEnqueueAcquireDX9MediaSurfacesKHR_fn clEnqueueAcquireDX9MediaSurfacesKHR = NULL;
-clEnqueueReleaseDX9MediaSurfacesKHR_fn clEnqueueReleaseDX9MediaSurfacesKHR = NULL;
-
-cl_platform_id gPlatformIDdetected;
-cl_device_id gDeviceIDdetected;
-cl_device_type gDeviceTypeSelected = CL_DEVICE_TYPE_DEFAULT;
-
-bool MediaSurfaceSharingExtensionInit()
-{
-  clGetDeviceIDsFromDX9MediaAdapterKHR = (clGetDeviceIDsFromDX9MediaAdapterKHR_fn)clGetExtensionFunctionAddressForPlatform(gPlatformIDdetected, "clGetDeviceIDsFromDX9MediaAdapterKHR");
-  if (clGetDeviceIDsFromDX9MediaAdapterKHR == NULL)
-  {
-    log_error("clGetExtensionFunctionAddressForPlatform(clGetDeviceIDsFromDX9MediaAdapterKHR) returned NULL.\n");
-    return false;
-  }
-
-  clCreateFromDX9MediaSurfaceKHR = (clCreateFromDX9MediaSurfaceKHR_fn)clGetExtensionFunctionAddressForPlatform(gPlatformIDdetected, "clCreateFromDX9MediaSurfaceKHR");
-  if (clCreateFromDX9MediaSurfaceKHR == NULL)
-  {
-    log_error("clGetExtensionFunctionAddressForPlatform(clCreateFromDX9MediaSurfaceKHR) returned NULL.\n");
-    return false;
-  }
-
-  clEnqueueAcquireDX9MediaSurfacesKHR = (clEnqueueAcquireDX9MediaSurfacesKHR_fn)clGetExtensionFunctionAddressForPlatform(gPlatformIDdetected, "clEnqueueAcquireDX9MediaSurfacesKHR");
-  if (clEnqueueAcquireDX9MediaSurfacesKHR == NULL)
-  {
-    log_error("clGetExtensionFunctionAddressForPlatform(clEnqueueAcquireDX9MediaSurfacesKHR) returned NULL.\n");
-    return false;
-  }
-
-  clEnqueueReleaseDX9MediaSurfacesKHR = (clEnqueueReleaseDX9MediaSurfacesKHR_fn)clGetExtensionFunctionAddressForPlatform(gPlatformIDdetected, "clEnqueueReleaseDX9MediaSurfacesKHR");
-  if (clEnqueueReleaseDX9MediaSurfacesKHR == NULL)
-  {
-    log_error("clGetExtensionFunctionAddressForPlatform(clEnqueueReleaseDX9MediaSurfacesKHR) returned NULL.\n");
-    return false;
-  }
-
-  return true;
-}
-
-bool DetectPlatformAndDevice()
-{
-  std::vector<cl_platform_id> platforms;
-  cl_uint platformsNum = 0;
-  cl_int error = clGetPlatformIDs(0, 0, &platformsNum);
-  if (error != CL_SUCCESS)
-  {
-    print_error(error, "clGetPlatformIDs failed\n");
-    return false;
-  }
-
-  platforms.resize(platformsNum);
-  error = clGetPlatformIDs(platformsNum, &platforms[0], 0);
-  if (error != CL_SUCCESS)
-  {
-    print_error(error, "clGetPlatformIDs failed\n");
-    return false;
-  }
-
-  bool found = false;
-  for (size_t i = 0; i < platformsNum; ++i)
-  {
-    std::vector<cl_device_id> devices;
-    cl_uint devicesNum = 0;
-    error = clGetDeviceIDs(platforms[i], gDeviceTypeSelected, 0, 0, &devicesNum);
-    if (error != CL_SUCCESS)
-    {
-      print_error(error, "clGetDeviceIDs failed\n");
-      return false;
-    }
-
-    devices.resize(devicesNum);
-    error = clGetDeviceIDs(platforms[i], gDeviceTypeSelected, devicesNum, &devices[0], 0);
-    if (error != CL_SUCCESS)
-    {
-      print_error(error, "clGetDeviceIDs failed\n");
-      return false;
-    }
-
-    for (size_t j = 0; j < devicesNum; ++j)
-    {
-      if (is_extension_available(devices[j], "cl_khr_dx9_media_sharing"))
-      {
-        gPlatformIDdetected = platforms[i];
-        gDeviceIDdetected = devices[j];
-        found = true;
-        break;
-      }
-    }
-  }
-
-  if (!found)
-  {
-    log_info("Test was not run, because the media surface sharing extension is not supported for any devices.\n");
-    return false;
-  }
-
-  return true;
-}
-
-bool CmdlineParse(int argc, const char *argv[])
-{
-  char *env_mode = getenv( "CL_DEVICE_TYPE" );
-  if( env_mode != NULL )
-  {
-    if(strcmp(env_mode, "gpu") == 0 || strcmp(env_mode, "CL_DEVICE_TYPE_GPU") == 0)
-      gDeviceTypeSelected = CL_DEVICE_TYPE_GPU;
-    else if(strcmp(env_mode, "cpu") == 0 || strcmp(env_mode, "CL_DEVICE_TYPE_CPU") == 0)
-      gDeviceTypeSelected = CL_DEVICE_TYPE_CPU;
-    else if(strcmp(env_mode, "accelerator") == 0 || strcmp(env_mode, "CL_DEVICE_TYPE_ACCELERATOR") == 0)
-      gDeviceTypeSelected = CL_DEVICE_TYPE_ACCELERATOR;
-    else if(strcmp(env_mode, "default") == 0 || strcmp(env_mode, "CL_DEVICE_TYPE_DEFAULT") == 0)
-      gDeviceTypeSelected = CL_DEVICE_TYPE_DEFAULT;
-    else
-    {
-      log_error("Unknown CL_DEVICE_TYPE env variable setting: %s.\nAborting...\n", env_mode);
-      return false;
-    }
-  }
-
-  for (int i = 0; i < argc; ++i)
-  {
-    if(strcmp(argv[i], "gpu") == 0 || strcmp(argv[i], "CL_DEVICE_TYPE_GPU") == 0)
-    {
-      gDeviceTypeSelected = CL_DEVICE_TYPE_GPU;
-      continue;
-    }
-    else if(strcmp( argv[i], "cpu") == 0 || strcmp(argv[i], "CL_DEVICE_TYPE_CPU") == 0)
-    {
-      gDeviceTypeSelected = CL_DEVICE_TYPE_CPU;
-      continue;
-    }
-    else if(strcmp( argv[i], "accelerator") == 0 || strcmp(argv[i], "CL_DEVICE_TYPE_ACCELERATOR") == 0)
-    {
-      gDeviceTypeSelected = CL_DEVICE_TYPE_ACCELERATOR;
-      continue;
-    }
-    else if(strcmp(argv[i], "CL_DEVICE_TYPE_DEFAULT") == 0)
-    {
-      gDeviceTypeSelected = CL_DEVICE_TYPE_DEFAULT;
-      continue;
-    }
-    else if (strcmp(argv[i], "sw") == 0 || strcmp(argv[i], "software") == 0)
-    {
-      CDeviceWrapper::AccelerationType(CDeviceWrapper::ACCELERATION_SW);
-    }
-  }
-
-  return true;
-}
-
-int main(int argc, const char *argv[])
-{
-  if (!CmdlineParse(argc, argv))
-    return TEST_FAIL;
-
-  if (!DetectPlatformAndDevice())
-  {
-    log_info("Test was not run, because the media surface sharing extension is not supported\n");
-    return TEST_SKIP;
-  }
-
-  if (!MediaSurfaceSharingExtensionInit())
-    return TEST_FAIL;
-
-  return runTestHarness(argc, argv, test_num, test_list, true, 0);
-}
diff --git a/test_extensions/media_sharing/test_create_context.cpp b/test_extensions/media_sharing/test_create_context.cpp
deleted file mode 100644
index 5637bc5423..0000000000
--- a/test_extensions/media_sharing/test_create_context.cpp
+++ /dev/null
@@ -1,318 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "utils.h"
-
-int context_create(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-                   int num_elements, unsigned int width, unsigned int height,
-                   TContextFuncType functionCreate, cl_dx9_media_adapter_type_khr adapterType,
-                   TSurfaceFormat surfaceFormat, TSharedHandleType sharedHandle)
-{
-  CResult result;
-
-  //create device
-  std::auto_ptr<CDeviceWrapper> deviceWrapper;
-  if (!DeviceCreate(adapterType, deviceWrapper))
-  {
-    result.ResultSub(CResult::TEST_ERROR);
-    return result.Result();
-  }
-
-  //generate input data
-  std::vector<cl_uchar> bufferIn(width * height * 3 / 2, 0);
-  if(!YUVGenerate(surfaceFormat, bufferIn, width, height, 0, 255))
-  {
-    result.ResultSub(CResult::TEST_ERROR);
-    return result.Result();
-  }
-
-  while (deviceWrapper->AdapterNext())
-  {
-    cl_int error;
-    //check if the test can be run on the adapter
-    if (CL_SUCCESS != (error = deviceExistForCLTest(gPlatformIDdetected, adapterType, deviceWrapper->Device(), result, sharedHandle)))
-    {
-      return result.Result();
-    }
-
-    if (surfaceFormat != SURFACE_FORMAT_NV12 && !SurfaceFormatCheck(adapterType, *deviceWrapper, surfaceFormat))
-    {
-      std::string sharedHandleStr = (sharedHandle == SHARED_HANDLE_ENABLED)? "yes": "no";
-      std::string formatStr;
-      std::string adapterStr;
-      SurfaceFormatToString(surfaceFormat, formatStr);
-      AdapterToString(adapterType, adapterStr);
-      log_info("Skipping test case, image format is not supported by a device (adapter type: %s, format: %s, shared handle: %s)\n",
-        adapterStr.c_str(), formatStr.c_str(), sharedHandleStr.c_str());
-      return result.Result();
-    }
-
-    void *objectSharedHandle = 0;
-    std::auto_ptr<CSurfaceWrapper> surface;
-    if (!MediaSurfaceCreate(adapterType, width, height, surfaceFormat, *deviceWrapper, surface,
-      (sharedHandle == SHARED_HANDLE_ENABLED) ? true: false, &objectSharedHandle))
-    {
-      log_error("Media surface creation failed for %i adapter\n", deviceWrapper->AdapterIdx());
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-    }
-
-    cl_context_properties contextProperties[] = {
-      CL_CONTEXT_PLATFORM, (cl_context_properties)gPlatformIDdetected,
-      AdapterTypeToContextInfo(adapterType), (cl_context_properties)deviceWrapper->Device(),
-      0,
-    };
-
-    clContextWrapper ctx;
-    switch(functionCreate)
-    {
-    case CONTEXT_CREATE_DEFAULT:
-      ctx = clCreateContext(&contextProperties[0], 1, &gDeviceIDdetected, NULL, NULL, &error);
-      break;
-    case CONTEXT_CREATE_FROM_TYPE:
-      ctx = clCreateContextFromType(&contextProperties[0], gDeviceTypeSelected, NULL, NULL, &error);
-      break;
-    default:
-      log_error("Unknown context creation function enum\n");
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-      break;
-    }
-
-    if (error != CL_SUCCESS)
-    {
-      std::string functionName;
-      FunctionContextCreateToString(functionCreate, functionName);
-      log_error("%s failed: %s\n", functionName.c_str(), IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    if (!YUVSurfaceSet(surfaceFormat, surface, bufferIn, width, height))
-    {
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-    }
-
-#if defined(_WIN32)
-    cl_dx9_surface_info_khr surfaceInfo;
-    surfaceInfo.resource = *(static_cast<CD3D9SurfaceWrapper *>(surface.get()));
-    surfaceInfo.shared_handle = objectSharedHandle;
-#else
-    void *surfaceInfo = 0;
-    return TEST_NOT_IMPLEMENTED;
-#endif
-
-    std::vector<cl_mem> memObjList;
-    unsigned int planesNum = PlanesNum(surfaceFormat);
-    std::vector<clMemWrapper> planesList(planesNum);
-    for (unsigned int planeIdx = 0; planeIdx < planesNum; ++planeIdx)
-    {
-      planesList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(ctx, CL_MEM_READ_WRITE, adapterType, &surfaceInfo, planeIdx, &error);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clCreateFromDX9MediaSurfaceKHR failed for plane %i: %s\n", planeIdx, IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-      memObjList.push_back(planesList[planeIdx]);
-    }
-
-    clCommandQueueWrapper cmdQueue = clCreateCommandQueueWithProperties(ctx, gDeviceIDdetected, 0, &error );
-    if (error != CL_SUCCESS)
-    {
-      log_error("Unable to create command queue: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    if (!ImageInfoVerify(adapterType, memObjList, width, height, surface, objectSharedHandle))
-    {
-      log_error("Image info verification failed\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    cl_event event;
-    error = clEnqueueAcquireDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjList.size()),
-      &memObjList.at(0), 0, NULL, &event);
-    if (error != CL_SUCCESS)
-    {
-      log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    cl_uint eventType = 0;
-    error = clGetEventInfo( event, CL_EVENT_COMMAND_TYPE, sizeof(eventType), &eventType, NULL);
-    if (error != CL_SUCCESS)
-    {
-      log_error("clGetEventInfo failed: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    if(eventType != CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR)
-    {
-      log_error("Invalid event != CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    clReleaseEvent(event);
-
-    size_t origin[3] = {0,0,0};
-    size_t offset = 0;
-    size_t frameSize = width * height * 3 / 2;
-    std::vector<cl_uchar> out( frameSize, 0 );
-    for (size_t i = 0; i < memObjList.size(); ++i)
-    {
-      size_t planeWidth = (i == 0) ? width: width / 2;
-      size_t planeHeight = (i == 0) ? height: height / 2;
-      size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-
-      error = clEnqueueReadImage(cmdQueue, memObjList.at(i), CL_TRUE, origin, regionPlane, 0, 0, &out.at(offset), 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueReadImage failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-
-      offset += planeWidth * planeHeight;
-    }
-
-    if (!YUVCompare(surfaceFormat, out, bufferIn, width, height))
-    {
-      log_error("OCL object verification failed - clEnqueueReadImage\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    error = clEnqueueReleaseDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjList.size()),
-      &memObjList.at(0), 0, NULL, &event);
-    if (error != CL_SUCCESS)
-    {
-      log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    eventType = 0;
-    error = clGetEventInfo( event, CL_EVENT_COMMAND_TYPE, sizeof(eventType), &eventType, NULL);
-    if (error != CL_SUCCESS)
-    {
-      log_error("clGetEventInfo failed: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    if(eventType != CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR)
-    {
-      log_error("Invalid event != CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    clReleaseEvent(event);
-
-    //object verification
-    std::vector<cl_uchar> bufferOut(frameSize, 0);
-    if (!YUVSurfaceGet(surfaceFormat, surface, bufferOut, width, height))
-    {
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    if (!YUVCompare(surfaceFormat, bufferOut, bufferIn, width, height))
-    {
-      log_error("Media surface is different than expected\n");
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-  }
-
-  if (deviceWrapper->Status() != DEVICE_PASS)
-  {
-    std::string adapterName;
-    AdapterToString(adapterType, adapterName);
-    if (deviceWrapper->Status() == DEVICE_FAIL)
-    {
-    log_error("%s init failed\n", adapterName.c_str());
-    result.ResultSub(CResult::TEST_FAIL);
-    }
-    else
-    {
-      log_error("%s init incomplete due to unsupported device\n", adapterName.c_str());
-      result.ResultSub(CResult::TEST_NOTSUPPORTED);
-    }
-  }
-
-  return result.Result();
-}
-
-int test_context_create(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-{
-  const unsigned int WIDTH = 256;
-  const unsigned int HEIGHT = 256;
-
-  std::vector<cl_dx9_media_adapter_type_khr> adapterTypes;
-#if defined(_WIN32)
-  adapterTypes.push_back(CL_ADAPTER_D3D9_KHR);
-  adapterTypes.push_back(CL_ADAPTER_D3D9EX_KHR);
-  adapterTypes.push_back(CL_ADAPTER_DXVA_KHR);
-#endif
-
-  std::vector<TContextFuncType> contextFuncs;
-  contextFuncs.push_back(CONTEXT_CREATE_DEFAULT);
-  contextFuncs.push_back(CONTEXT_CREATE_FROM_TYPE);
-
-  std::vector<TSurfaceFormat> formats;
-  formats.push_back(SURFACE_FORMAT_NV12);
-  formats.push_back(SURFACE_FORMAT_YV12);
-
-  std::vector<TSharedHandleType> sharedHandleTypes;
-  sharedHandleTypes.push_back(SHARED_HANDLE_DISABLED);
-#if defined(_WIN32)
-  sharedHandleTypes.push_back(SHARED_HANDLE_ENABLED);
-#endif
-
-  CResult result;
-  for (size_t adapterTypeIdx = 0; adapterTypeIdx < adapterTypes.size(); ++adapterTypeIdx)
-  {
-    //iteration through all create context functions
-    for (size_t contextFuncIdx = 0; contextFuncIdx < contextFuncs.size(); ++contextFuncIdx)
-    {
-      //iteration through surface formats
-      for (size_t formatIdx = 0; formatIdx < formats.size(); ++formatIdx)
-      {
-        //shared handle enabled or disabled
-        for (size_t sharedHandleIdx = 0; sharedHandleIdx < sharedHandleTypes.size(); ++sharedHandleIdx)
-        {
-          if (adapterTypes[adapterTypeIdx] == CL_ADAPTER_D3D9_KHR && sharedHandleTypes[sharedHandleIdx] == SHARED_HANDLE_ENABLED)
-            continue;
-
-          if(context_create(deviceID, context, queue, num_elements, WIDTH, HEIGHT,
-            contextFuncs[contextFuncIdx], adapterTypes[adapterTypeIdx], formats[formatIdx],
-            sharedHandleTypes[sharedHandleIdx]) != 0)
-          {
-            std::string sharedHandle = (sharedHandleTypes[sharedHandleIdx] == SHARED_HANDLE_ENABLED)? "shared handle": "no shared handle";
-            std::string formatStr;
-            std::string adapterTypeStr;
-            SurfaceFormatToString(formats[formatIdx], formatStr);
-            AdapterToString(adapterTypes[adapterTypeIdx], adapterTypeStr);
-
-            log_error("\nTest case - clCreateContext (%s, %s, %s) failed\n\n", adapterTypeStr.c_str(), formatStr.c_str(), sharedHandle.c_str());
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-        }
-      }
-    }
-  }
-
-  return result.Result();
-}
diff --git a/test_extensions/media_sharing/test_functions_api.cpp b/test_extensions/media_sharing/test_functions_api.cpp
deleted file mode 100644
index cdc6ce860d..0000000000
--- a/test_extensions/media_sharing/test_functions_api.cpp
+++ /dev/null
@@ -1,617 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "utils.h"
-
-int api_functions(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements,
-                  unsigned int iterationNum, unsigned int width, unsigned int height, cl_dx9_media_adapter_type_khr adapterType,
-                  TSurfaceFormat surfaceFormat, TSharedHandleType sharedHandle)
-{
-  const unsigned int FRAME_NUM = 2;
-  const cl_uchar MAX_VALUE = 255 / 2;
-  CResult result;
-
-  //create device
-  std::auto_ptr<CDeviceWrapper> deviceWrapper;
-  if (!DeviceCreate(adapterType, deviceWrapper))
-  {
-    result.ResultSub(CResult::TEST_ERROR);
-    return result.Result();
-  }
-
-  //generate input and expected data
-  std::vector<std::vector<cl_uchar> > bufferRef1(FRAME_NUM);
-  std::vector<std::vector<cl_uchar> > bufferRef2(FRAME_NUM);
-  std::vector<std::vector<cl_uchar> > bufferRef3(FRAME_NUM);
-  size_t frameSize = width * height * 3 / 2;
-  cl_uchar step = MAX_VALUE / FRAME_NUM;
-  for (size_t i = 0; i < FRAME_NUM; ++i)
-  {
-    if (!YUVGenerate(surfaceFormat, bufferRef1[i], width, height, static_cast<cl_uchar>(step * i), static_cast<cl_uchar>(step * (i + 1))) ||
-        !YUVGenerate(surfaceFormat, bufferRef2[i], width, height, static_cast<cl_uchar>(step * i), static_cast<cl_uchar>(step * (i + 1)), 0.2) ||
-        !YUVGenerate(surfaceFormat, bufferRef3[i], width, height, static_cast<cl_uchar>(step * i), static_cast<cl_uchar>(step * (i + 1)), 0.4))
-    {
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-    }
-  }
-
-  //iterates through all devices
-  while (deviceWrapper->AdapterNext())
-  {
-    cl_int error;
-    //check if the test can be run on the adapter
-    if (CL_SUCCESS != (error = deviceExistForCLTest(gPlatformIDdetected, adapterType, deviceWrapper->Device(), result, sharedHandle)))
-    {
-      return result.Result();
-    }
-
-    if (surfaceFormat != SURFACE_FORMAT_NV12 && !SurfaceFormatCheck(adapterType, *deviceWrapper, surfaceFormat))
-    {
-      std::string sharedHandleStr = (sharedHandle == SHARED_HANDLE_ENABLED)? "yes": "no";
-      std::string formatStr;
-      std::string adapterStr;
-      SurfaceFormatToString(surfaceFormat, formatStr);
-      AdapterToString(adapterType, adapterStr);
-      log_info("Skipping test case, image format is not supported by a device (adapter type: %s, format: %s, shared handle: %s)\n",
-        adapterStr.c_str(), formatStr.c_str(), sharedHandleStr.c_str());
-      return result.Result();
-    }
-
-    void *objectSharedHandle = 0;
-    std::auto_ptr<CSurfaceWrapper> surface;
-
-    //create surface
-    if (!MediaSurfaceCreate(adapterType, width, height, surfaceFormat, *deviceWrapper, surface,
-      (sharedHandle == SHARED_HANDLE_ENABLED) ? true: false, &objectSharedHandle))
-    {
-      log_error("Media surface creation failed for %i adapter\n", deviceWrapper->AdapterIdx());
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-    }
-
-    cl_context_properties contextProperties[] = {
-      CL_CONTEXT_PLATFORM, (cl_context_properties)gPlatformIDdetected,
-      AdapterTypeToContextInfo(adapterType), (cl_context_properties)deviceWrapper->Device(),
-      0,
-    };
-
-    clContextWrapper ctx = clCreateContext(&contextProperties[0], 1, &gDeviceIDdetected, NULL, NULL, &error);
-    if (error != CL_SUCCESS)
-    {
-      log_error("clCreateContext failed: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-#if defined(_WIN32)
-    cl_dx9_surface_info_khr surfaceInfo;
-    surfaceInfo.resource = *(static_cast<CD3D9SurfaceWrapper *>(surface.get()));
-    surfaceInfo.shared_handle = objectSharedHandle;
-#else
-    void *surfaceInfo = 0;
-    return TEST_NOT_IMPLEMENTED;
-#endif
-
-    std::vector<cl_mem> memObjList;
-    unsigned int planesNum = PlanesNum(surfaceFormat);
-    std::vector<clMemWrapper> planesList(planesNum);
-    for (unsigned int planeIdx = 0; planeIdx < planesNum; ++planeIdx)
-    {
-      planesList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(ctx, CL_MEM_READ_WRITE, adapterType, &surfaceInfo, planeIdx, &error);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clCreateFromDX9MediaSurfaceKHR failed for plane %i: %s\n", planeIdx, IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-      memObjList.push_back(planesList[planeIdx]);
-    }
-
-    clCommandQueueWrapper cmdQueue = clCreateCommandQueueWithProperties(ctx, gDeviceIDdetected, 0, &error );
-    if (error != CL_SUCCESS)
-    {
-      log_error("Unable to create command queue: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    if (!ImageInfoVerify(adapterType, memObjList, width, height, surface, objectSharedHandle))
-    {
-      log_error("Image info verification failed\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    for (size_t frameIdx = 0; frameIdx < iterationNum; ++frameIdx)
-    {
-      if (!YUVSurfaceSet(surfaceFormat, surface, bufferRef1[frameIdx % FRAME_NUM], width, height))
-      {
-        result.ResultSub(CResult::TEST_ERROR);
-        return result.Result();
-      }
-
-      error = clEnqueueAcquireDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjList.size()), &memObjList[0], 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-
-      { //read operation
-        std::vector<cl_uchar> out( frameSize, 0 );
-        size_t offset = 0;
-        size_t origin[3] = {0,0,0};
-
-        for (size_t i = 0; i < memObjList.size(); ++i)
-        {
-          size_t planeWidth = (i == 0) ? width: width / 2;
-          size_t planeHeight = (i == 0) ? height: height / 2;
-          size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-
-          error = clEnqueueReadImage(cmdQueue, memObjList[i], CL_TRUE, origin, regionPlane, 0, 0,
-            &out[offset], 0, 0, 0);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueReadImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          offset += planeWidth * planeHeight;
-        }
-
-        if (!YUVCompare(surfaceFormat, out, bufferRef1[frameIdx % FRAME_NUM], width, height))
-        {
-          log_error("Frame idx: %i, OCL image is different then shared OCL object: clEnqueueReadImage\n", frameIdx);
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-      }
-
-      { //write operation
-        size_t offset = 0;
-        size_t origin[3] = {0,0,0};
-        for (size_t i = 0; i < memObjList.size(); ++i)
-        {
-          size_t planeWidth = (i == 0) ? width: width / 2;
-          size_t planeHeight = (i == 0) ? height: height / 2;
-          size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-
-          error = clEnqueueWriteImage(cmdQueue, memObjList[i], CL_TRUE, origin, regionPlane,
-            0, 0, &bufferRef2[frameIdx % FRAME_NUM][offset], 0, 0, 0);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueWriteImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          offset += planeWidth * planeHeight;
-        }
-      }
-
-      { //read operation
-        std::vector<cl_uchar> out( frameSize, 0 );
-        size_t offset = 0;
-        size_t origin[3] = {0,0,0};
-
-        for (size_t i = 0; i < memObjList.size(); ++i)
-        {
-          size_t planeWidth = (i == 0) ? width: width / 2;
-          size_t planeHeight = (i == 0) ? height: height / 2;
-          size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-
-          error = clEnqueueReadImage(cmdQueue, memObjList[i], CL_TRUE, origin, regionPlane, 0, 0,
-            &out[offset], 0, 0, 0);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueReadImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          offset += planeWidth * planeHeight;
-        }
-
-        if (!YUVCompare(surfaceFormat, out, bufferRef2[frameIdx % FRAME_NUM], width, height))
-        {
-          log_error("Frame idx: %i, Shared OCL image verification after clEnqueueWriteImage failed\n", frameIdx);
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-      }
-
-      { //copy operation (shared OCL to OCL)
-        size_t offset = 0;
-        size_t origin[3] = {0,0,0};
-        std::vector<cl_uchar> out( frameSize, 0 );
-        for (size_t i = 0; i < memObjList.size(); ++i)
-        {
-          size_t planeWidth = (i == 0) ? width: width / 2;
-          size_t planeHeight = (i == 0) ? height: height / 2;
-          size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-
-          cl_image_format formatPlane;
-          formatPlane.image_channel_data_type = CL_UNORM_INT8;
-          formatPlane.image_channel_order = (surfaceFormat == SURFACE_FORMAT_NV12 && i > 0)? CL_RG: CL_R;
-
-          cl_image_desc imageDesc = {0};
-          imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
-          imageDesc.image_width = planeWidth;
-          imageDesc.image_height = planeHeight;
-
-          clMemWrapper planeOCL = clCreateImage(ctx, CL_MEM_READ_WRITE, &formatPlane, &imageDesc, 0, &error);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clCreateImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          error = clEnqueueCopyImage(cmdQueue, memObjList[i], planeOCL, origin, origin, regionPlane, 0, 0, 0);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueCopyImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          error = clEnqueueReadImage(cmdQueue, planeOCL, CL_TRUE, origin, regionPlane, 0, 0, &out[offset], 0, 0, 0);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueReadImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          offset += planeWidth * planeHeight;
-        }
-
-        if (!YUVCompare(surfaceFormat, out, bufferRef2[frameIdx % FRAME_NUM], width, height))
-        {
-          log_error("Frame idx: %i, OCL image verification after clEnqueueCopyImage (from shared OCL to OCL) failed\n", frameIdx);
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-      }
-
-      { //copy operation (OCL to shared OCL)
-        size_t offset = 0;
-        size_t origin[3] = {0,0,0};
-        std::vector<cl_uchar> out( frameSize, 0 );
-        for (size_t i = 0; i < memObjList.size(); ++i)
-        {
-          size_t planeWidth = (i == 0) ? width: width / 2;
-          size_t planeHeight = (i == 0) ? height: height / 2;
-          size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-          size_t pitchSize = ((surfaceFormat == SURFACE_FORMAT_NV12 && i > 0)? width: planeWidth) * sizeof(cl_uchar);
-
-          cl_image_format formatPlane;
-          formatPlane.image_channel_data_type = CL_UNORM_INT8;
-          formatPlane.image_channel_order = (surfaceFormat == SURFACE_FORMAT_NV12 && i > 0)? CL_RG: CL_R;
-
-          cl_image_desc imageDesc = {0};
-          imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
-          imageDesc.image_width = planeWidth;
-          imageDesc.image_height = planeHeight;
-          imageDesc.image_row_pitch = pitchSize;
-
-          clMemWrapper planeOCL = clCreateImage(ctx, CL_MEM_COPY_HOST_PTR, &formatPlane, &imageDesc, &bufferRef1[frameIdx % FRAME_NUM][offset], &error);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clCreateImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          error = clEnqueueCopyImage(cmdQueue, planeOCL, memObjList[i], origin, origin, regionPlane, 0, 0, 0);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueCopyImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          error = clEnqueueReadImage(cmdQueue, memObjList[i], CL_TRUE, origin, regionPlane, 0, 0, &out[offset], 0, 0, 0);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueReadImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          offset += planeWidth * planeHeight;
-        }
-
-        if (!YUVCompare(surfaceFormat, out, bufferRef1[frameIdx % FRAME_NUM], width, height))
-        {
-          log_error("Frame idx: %i, OCL image verification after clEnqueueCopyImage (from OCL to shared OCL) failed\n", frameIdx);
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-      }
-
-      { //copy from image to buffer
-        size_t offset = 0;
-        size_t origin[3] = {0,0,0};
-        size_t bufferSize = sizeof(cl_uchar) * frameSize;
-        clMemWrapper buffer = clCreateBuffer( ctx, CL_MEM_READ_WRITE, bufferSize, NULL, &error);
-        if (error != CL_SUCCESS)
-        {
-          log_error("clCreateBuffer failed: %s\n", IGetErrorString(error));
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        for (size_t i = 0; i < memObjList.size(); ++i)
-        {
-          size_t planeWidth = (i == 0) ? width: width / 2;
-          size_t planeHeight = (i == 0) ? height: height / 2;
-          size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-
-          error = clEnqueueCopyImageToBuffer(cmdQueue, memObjList[i], buffer, origin, regionPlane, offset, 0, 0, 0);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueCopyImageToBuffer failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          offset += planeWidth * planeHeight * sizeof(cl_uchar);
-        }
-
-        std::vector<cl_uchar> out( frameSize, 0 );
-        error = clEnqueueReadBuffer( cmdQueue, buffer, CL_TRUE, 0, bufferSize, &out[0], 0, NULL, NULL );
-        if (error != CL_SUCCESS)
-        {
-          log_error("Unable to read buffer");
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        if (!YUVCompare(surfaceFormat, out, bufferRef1[frameIdx % FRAME_NUM], width, height))
-        {
-          log_error("Frame idx: %i, OCL buffer verification after clEnqueueCopyImageToBuffer (from shared OCL image to OCL buffer) failed\n", frameIdx);
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-      }
-
-      { //copy buffer to image
-        size_t bufferSize = sizeof(cl_uchar) * frameSize;
-        clMemWrapper buffer = clCreateBuffer( ctx, CL_MEM_COPY_HOST_PTR, bufferSize, &bufferRef2[frameIdx % FRAME_NUM][0], &error);
-        if (error != CL_SUCCESS)
-        {
-          log_error("clCreateBuffer failed: %s\n", IGetErrorString(error));
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        size_t offset = 0;
-        size_t origin[3] = {0,0,0};
-        std::vector<cl_uchar> out( frameSize, 0 );
-        for (size_t i = 0; i < memObjList.size(); ++i)
-        {
-          size_t planeWidth = (i == 0) ? width: width / 2;
-          size_t planeHeight = (i == 0) ? height: height / 2;
-          size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-
-          error = clEnqueueCopyBufferToImage(cmdQueue, buffer, memObjList[i], offset, origin, regionPlane, 0, 0, 0);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueCopyBufferToImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          error = clEnqueueReadImage(cmdQueue, memObjList[i], CL_TRUE, origin, regionPlane, 0, 0, &out[offset], 0, 0, 0);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueReadImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          offset += planeWidth * planeHeight * sizeof(cl_uchar);
-        }
-
-        if (!YUVCompare(surfaceFormat, out, bufferRef2[frameIdx % FRAME_NUM], width, height))
-        {
-          log_error("Frame idx: %i, OCL image verification after clEnqueueCopyBufferToImage (from OCL buffer to shared OCL image) failed\n", frameIdx);
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-      }
-
-      { //map operation to read
-        size_t offset = 0;
-        size_t origin[3] = {0,0,0};
-        std::vector<cl_uchar> out( frameSize, 0 );
-        for (size_t i = 0; i < memObjList.size(); ++i)
-        {
-          size_t planeWidth = (i == 0) ? width: width / 2;
-          size_t planeHeight = (i == 0) ? height: height / 2;
-          size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-          size_t pitchSize = ((surfaceFormat == SURFACE_FORMAT_NV12 && i > 0)? width: planeWidth);
-
-          size_t rowPitch = 0;
-          size_t slicePitch = 0;
-          void *mapPtr = clEnqueueMapImage(cmdQueue, memObjList[i], CL_TRUE, CL_MAP_READ, origin, regionPlane,
-            &rowPitch, &slicePitch, 0, 0, 0, &error);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueMapImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          for (size_t y = 0; y < planeHeight; ++y)
-            memcpy(&out[offset + y * pitchSize], static_cast<cl_uchar *>(mapPtr) + y * rowPitch / sizeof(cl_uchar), pitchSize * sizeof(cl_uchar));
-
-          error = clEnqueueUnmapMemObject(cmdQueue, memObjList[i], mapPtr, 0, 0, 0);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueUnmapMemObject failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          offset += pitchSize * planeHeight;
-        }
-
-        if (!YUVCompare(surfaceFormat, out, bufferRef2[frameIdx % FRAME_NUM], width, height))
-        {
-          log_error("Frame idx: %i, Mapped shared OCL image is different then expected\n", frameIdx);
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-      }
-
-      { //map operation to write
-        size_t offset = 0;
-        size_t origin[3] = {0,0,0};
-        for (size_t i = 0; i < memObjList.size(); ++i)
-        {
-          size_t planeWidth = (i == 0) ? width: width / 2;
-          size_t planeHeight = (i == 0) ? height: height / 2;
-          size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-          size_t pitchSize = ((surfaceFormat == SURFACE_FORMAT_NV12 && i > 0)? width: planeWidth);
-
-          size_t rowPitch = 0;
-          size_t slicePitch = 0;
-          void *mapPtr = clEnqueueMapImage(cmdQueue, memObjList[i], CL_TRUE, CL_MAP_WRITE, origin, regionPlane,
-            &rowPitch, &slicePitch, 0, 0, 0, &error);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueMapImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          for (size_t y = 0; y < planeHeight; ++y)
-            memcpy(static_cast<cl_uchar *>(mapPtr) + y * rowPitch / sizeof(cl_uchar), &bufferRef3[frameIdx % FRAME_NUM][offset + y * pitchSize], pitchSize * sizeof(cl_uchar));
-
-          error = clEnqueueUnmapMemObject(cmdQueue, memObjList[i], mapPtr, 0, 0, 0);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueUnmapMemObject failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          offset += pitchSize * planeHeight;
-        }
-      }
-
-      error = clEnqueueReleaseDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjList.size()), &memObjList[0], 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-
-      std::vector<cl_uchar> bufferOut(frameSize, 0);
-      if (!YUVSurfaceGet(surfaceFormat, surface, bufferOut, width, height))
-      {
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-
-      if (!YUVCompare(surfaceFormat, bufferOut, bufferRef3[frameIdx % FRAME_NUM], width, height))
-      {
-        log_error("Frame idx: %i, media surface is different than expected\n", frameIdx);
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-    }
-  }
-
-  if (deviceWrapper->Status() != DEVICE_PASS)
-  {
-    std::string adapterName;
-    AdapterToString(adapterType, adapterName);
-    if (deviceWrapper->Status() == DEVICE_FAIL)
-    {
-      log_error("%s init failed\n", adapterName.c_str());
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-    else
-    {
-      log_error("%s init incomplete due to unsupported device\n", adapterName.c_str());
-      result.ResultSub(CResult::TEST_NOTSUPPORTED);
-    }
-  }
-
-  return result.Result();
-}
-
-int test_api(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-{
-  CResult result;
-
-#if defined(_WIN32)
-  //D3D9
-  if(api_functions(deviceID, context, queue, num_elements, 10, 256, 256, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, NV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(api_functions(deviceID, context, queue, num_elements, 3, 512, 256, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, YV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  //D3D9EX
-  if(api_functions(deviceID, context, queue, num_elements, 5, 256, 512, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, NV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(api_functions(deviceID, context, queue, num_elements, 7, 512, 256, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, NV12, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(api_functions(deviceID, context, queue, num_elements, 10, 256, 256, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, YV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(api_functions(deviceID, context, queue, num_elements, 15, 128, 128, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, YV12, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  //DXVA
-  if(api_functions(deviceID, context, queue, num_elements, 20, 128, 128, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, NV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(api_functions(deviceID, context, queue, num_elements, 40, 64, 64, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, NV12, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(api_functions(deviceID, context, queue, num_elements, 5, 512, 512, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, YV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(api_functions(deviceID, context, queue, num_elements, 2, 1024, 1024, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, YV12, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-#else
-  return TEST_NOT_IMPLEMENTED;
-#endif
-
-  return result.Result();
-}
diff --git a/test_extensions/media_sharing/test_functions_kernel.cpp b/test_extensions/media_sharing/test_functions_kernel.cpp
deleted file mode 100644
index f5c3e2daf3..0000000000
--- a/test_extensions/media_sharing/test_functions_kernel.cpp
+++ /dev/null
@@ -1,446 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "harness/errorHelpers.h"
-#include "harness/kernelHelpers.h"
-
-#include "utils.h"
-
-int kernel_functions(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements,
-                     unsigned int iterationNum, unsigned int width, unsigned int height, cl_dx9_media_adapter_type_khr adapterType,
-                     TSurfaceFormat surfaceFormat, TSharedHandleType sharedHandle)
-{
-  const unsigned int FRAME_NUM = 2;
-  const cl_uchar MAX_VALUE = 255 / 2;
-  const std::string PROGRAM_STR =
-    "__kernel void TestFunction( read_only image2d_t planeIn, write_only image2d_t planeOut, "
-    NL "                            sampler_t sampler, __global int *planeRes)"
-    NL "{"
-    NL "  int w = get_global_id(0);"
-    NL "  int h = get_global_id(1);"
-    NL "  int width = get_image_width(planeIn);"
-    NL "  int height = get_image_height(planeOut);"
-    NL "  float4 color0 = read_imagef(planeIn, sampler, (int2)(w,h)) + 0.2f;"
-    NL "  float4 color1 = read_imagef(planeIn, sampler, (float2)(w,h)) + 0.2f;"
-    NL "  color0 = (color0 == color1) ? color0: (float4)(0.5, 0.5, 0.5, 0.5);"
-    NL "  write_imagef(planeOut, (int2)(w,h), color0);"
-    NL "  if(w == 0 && h == 0)"
-    NL "  {"
-    NL "    planeRes[0] = width;"
-    NL "    planeRes[1] = height;"
-    NL "  }"
-    NL "}";
-
-  CResult result;
-
-  std::auto_ptr<CDeviceWrapper> deviceWrapper;
-  if (!DeviceCreate(adapterType, deviceWrapper))
-  {
-    result.ResultSub(CResult::TEST_ERROR);
-    return result.Result();
-  }
-
-  std::vector<std::vector<cl_uchar> > bufferIn(FRAME_NUM);
-  std::vector<std::vector<cl_uchar> > bufferExp(FRAME_NUM);
-  size_t frameSize = width * height * 3 / 2;
-  cl_uchar step = MAX_VALUE / FRAME_NUM;
-  for (size_t i = 0; i < FRAME_NUM; ++i)
-  {
-    if (!YUVGenerate(surfaceFormat, bufferIn[i], width, height, static_cast<cl_uchar>(step * i), static_cast<cl_uchar>(step * (i + 1))) ||
-        !YUVGenerate(surfaceFormat, bufferExp[i], width, height, static_cast<cl_uchar>(step * i), static_cast<cl_uchar>(step * (i + 1)), 0.2))
-    {
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-    }
-  }
-
-  while (deviceWrapper->AdapterNext())
-  {
-    cl_int error;
-    //check if the test can be run on the adapter
-    if (CL_SUCCESS != (error = deviceExistForCLTest(gPlatformIDdetected, adapterType, deviceWrapper->Device(), result, sharedHandle)))
-    {
-      return result.Result();
-    }
-
-    if (surfaceFormat != SURFACE_FORMAT_NV12 && !SurfaceFormatCheck(adapterType, *deviceWrapper, surfaceFormat))
-    {
-      std::string sharedHandleStr = (sharedHandle == SHARED_HANDLE_ENABLED)? "yes": "no";
-      std::string formatStr;
-      std::string adapterStr;
-      SurfaceFormatToString(surfaceFormat, formatStr);
-      AdapterToString(adapterType, adapterStr);
-      log_info("Skipping test case, image format is not supported by a device (adapter type: %s, format: %s, shared handle: %s)\n",
-        adapterStr.c_str(), formatStr.c_str(), sharedHandleStr.c_str());
-      return result.Result();
-    }
-
-    void *objectSrcHandle = 0;
-    std::auto_ptr<CSurfaceWrapper> surfaceSrc;
-    if (!MediaSurfaceCreate(adapterType, width, height, surfaceFormat, *deviceWrapper, surfaceSrc,
-      (sharedHandle == SHARED_HANDLE_ENABLED) ? true: false, &objectSrcHandle))
-    {
-      log_error("Media surface creation failed for %i adapter\n", deviceWrapper->AdapterIdx());
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-    }
-
-    void *objectDstHandle = 0;
-    std::auto_ptr<CSurfaceWrapper> surfaceDst;
-    if (!MediaSurfaceCreate(adapterType, width, height, surfaceFormat, *deviceWrapper, surfaceDst,
-      (sharedHandle == SHARED_HANDLE_ENABLED) ? true: false, &objectDstHandle))
-    {
-      log_error("Media surface creation failed for %i adapter\n", deviceWrapper->AdapterIdx());
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-    }
-
-    cl_context_properties contextProperties[] = {
-      CL_CONTEXT_PLATFORM, (cl_context_properties)gPlatformIDdetected,
-      AdapterTypeToContextInfo(adapterType), (cl_context_properties)deviceWrapper->Device(),
-      0,
-    };
-
-    clContextWrapper ctx = clCreateContext(&contextProperties[0], 1, &gDeviceIDdetected, NULL, NULL, &error);
-    if (error != CL_SUCCESS)
-    {
-      log_error("clCreateContext failed: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-#if defined(_WIN32)
-    cl_dx9_surface_info_khr surfaceInfoSrc;
-    surfaceInfoSrc.resource = *(static_cast<CD3D9SurfaceWrapper *>(surfaceSrc.get()));
-    surfaceInfoSrc.shared_handle = objectSrcHandle;
-
-    cl_dx9_surface_info_khr surfaceInfoDst;
-    surfaceInfoDst.resource = *(static_cast<CD3D9SurfaceWrapper *>(surfaceDst.get()));
-    surfaceInfoDst.shared_handle = objectDstHandle;
-#else
-    void *surfaceInfoSrc = 0;
-    void *surfaceInfoDst = 0;
-    return TEST_NOT_IMPLEMENTED;
-#endif
-
-    std::vector<cl_mem> memObjSrcList;
-    std::vector<cl_mem> memObjDstList;
-    unsigned int planesNum = PlanesNum(surfaceFormat);
-    std::vector<clMemWrapper> planeSrcList(planesNum);
-    std::vector<clMemWrapper> planeDstList(planesNum);
-    for (unsigned int planeIdx = 0; planeIdx < planesNum; ++planeIdx)
-    {
-      planeSrcList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(ctx, CL_MEM_READ_WRITE, adapterType, &surfaceInfoSrc, planeIdx, &error);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clCreateFromDX9MediaSurfaceKHR failed for plane %i: %s\n", planeIdx, IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-      memObjSrcList.push_back(planeSrcList[planeIdx]);
-
-      planeDstList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(ctx, CL_MEM_READ_WRITE, adapterType, &surfaceInfoDst, planeIdx, &error);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clCreateFromDX9MediaSurfaceKHR failed for plane %i: %s\n", planeIdx, IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-      memObjDstList.push_back(planeDstList[planeIdx]);
-    }
-
-    clCommandQueueWrapper cmdQueue = clCreateCommandQueueWithProperties(ctx, gDeviceIDdetected, 0, &error );
-    if (error != CL_SUCCESS)
-    {
-      log_error("Unable to create command queue: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    if (!ImageInfoVerify(adapterType, memObjSrcList, width, height, surfaceSrc, objectSrcHandle))
-    {
-      log_error("Image info verification failed\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    for (size_t frameIdx = 0; frameIdx < iterationNum; ++frameIdx)
-    {
-      if (!YUVSurfaceSet(surfaceFormat, surfaceSrc, bufferIn[frameIdx % FRAME_NUM], width, height))
-      {
-        result.ResultSub(CResult::TEST_ERROR);
-        return result.Result();
-      }
-
-      error = clEnqueueAcquireDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjSrcList.size()), &memObjSrcList[0], 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-
-      error = clEnqueueAcquireDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjDstList.size()), &memObjDstList[0], 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-
-      clSamplerWrapper sampler = clCreateSampler( ctx, CL_FALSE, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error );
-      if(error != CL_SUCCESS)
-      {
-        log_error("Unable to create sampler\n");
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-
-      clProgramWrapper program;
-      clKernelWrapper kernel;
-      const char *progPtr = PROGRAM_STR.c_str();
-      if(create_single_kernel_helper(ctx, &program, &kernel, 1, (const char **)&progPtr, "TestFunction"))
-        result.ResultSub(CResult::TEST_FAIL);
-
-      size_t bufferSize = sizeof(cl_int) * 2;
-      clMemWrapper imageRes = clCreateBuffer( ctx, CL_MEM_READ_WRITE, bufferSize, NULL, &error);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clCreateBuffer failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-
-      size_t offset = 0;
-      size_t origin[3] = {0,0,0};
-      std::vector<cl_uchar> out( frameSize, 0 );
-      for (size_t i = 0; i < memObjSrcList.size(); ++i)
-      {
-        size_t planeWidth = (i == 0) ? width: width / 2;
-        size_t planeHeight = (i == 0) ? height: height / 2;
-        size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-        size_t threads[ 2 ] = { planeWidth, planeHeight };
-
-        error = clSetKernelArg( kernel, 0, sizeof( memObjSrcList[i] ), &memObjSrcList[i] );
-        if (error != CL_SUCCESS)
-        {
-          log_error("Unable to set kernel arguments" );
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        error = clSetKernelArg( kernel, 1, sizeof( memObjDstList[i] ), &memObjDstList[i] );
-        if (error != CL_SUCCESS)
-        {
-          log_error("Unable to set kernel arguments" );
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        error = clSetKernelArg( kernel, 2, sizeof( sampler ), &sampler );
-        if (error != CL_SUCCESS)
-        {
-          log_error("Unable to set kernel arguments" );
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        error = clSetKernelArg( kernel, 3, sizeof( imageRes ), &imageRes );
-        if (error != CL_SUCCESS)
-        {
-          log_error("Unable to set kernel arguments" );
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        size_t localThreads[ 2 ];
-        error = get_max_common_2D_work_group_size( ctx, kernel, threads, localThreads );
-        if (error != CL_SUCCESS)
-        {
-          log_error("Unable to get work group size to use" );
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        error = clEnqueueNDRangeKernel( cmdQueue, kernel, 2, NULL, threads, localThreads, 0, NULL, NULL );
-        if (error != CL_SUCCESS)
-        {
-          log_error("Unable to execute test kernel" );
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        std::vector<cl_uint> imageResOut(2, 0);
-        error = clEnqueueReadBuffer( cmdQueue, imageRes, CL_TRUE, 0, bufferSize, &imageResOut[0], 0, NULL, NULL );
-        if (error != CL_SUCCESS)
-        {
-          log_error("Unable to read buffer");
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        if(imageResOut[0] != planeWidth)
-        {
-          log_error("Invalid width value, test = %i, expected = %i\n", imageResOut[0], planeWidth);
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        if(imageResOut[1] != planeHeight)
-        {
-          log_error("Invalid height value, test = %i, expected = %i\n", imageResOut[1], planeHeight);
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        error = clEnqueueReadImage(cmdQueue, memObjDstList[i], CL_TRUE, origin, regionPlane, 0, 0, &out[offset], 0, 0, 0);
-        if (error != CL_SUCCESS)
-        {
-          log_error("clEnqueueReadImage failed: %s\n", IGetErrorString(error));
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        offset += planeWidth * planeHeight;
-      }
-
-      if (!YUVCompare(surfaceFormat, out, bufferExp[frameIdx % FRAME_NUM], width, height))
-      {
-        log_error("Frame idx: %i, OCL objects are different than expected\n", frameIdx);
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-
-      error = clEnqueueReleaseDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjSrcList.size()), &memObjSrcList[0], 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-
-      error = clEnqueueReleaseDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjDstList.size()), &memObjDstList[0], 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-
-      std::vector<cl_uchar> bufferOut(frameSize, 0);
-      if (!YUVSurfaceGet(surfaceFormat, surfaceDst, bufferOut, width, height))
-      {
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-
-      if (!YUVCompare(surfaceFormat, bufferOut, bufferExp[frameIdx % FRAME_NUM], width, height))
-      {
-        log_error("Frame idx: %i, media surface is different than expected\n", frameIdx);
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-    }
-  }
-
-  if (deviceWrapper->Status() != DEVICE_PASS)
-  {
-    std::string adapterName;
-    AdapterToString(adapterType, adapterName);
-    if (deviceWrapper->Status() == DEVICE_FAIL)
-  {
-      log_error("%s init failed\n", adapterName.c_str());
-    result.ResultSub(CResult::TEST_FAIL);
-    }
-    else
-    {
-      log_error("%s init incomplete due to unsupported device\n", adapterName.c_str());
-      result.ResultSub(CResult::TEST_NOTSUPPORTED);
-    }
-  }
-
-  return result.Result();
-}
-
-int test_kernel(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-{
-  CResult result;
-
-#if defined(_WIN32)
-  //D3D9
-  if(kernel_functions(deviceID, context, queue, num_elements, 10, 256, 256, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, NV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(kernel_functions(deviceID, context, queue, num_elements, 3, 256, 256, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, YV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  //D3D9EX
-  if(kernel_functions(deviceID, context, queue, num_elements, 5, 256, 512, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, NV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(kernel_functions(deviceID, context, queue, num_elements, 7, 512, 256, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, NV12, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(kernel_functions(deviceID, context, queue, num_elements, 10, 256, 256, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, YV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(kernel_functions(deviceID, context, queue, num_elements, 15, 128, 128, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, YV12, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  //DXVA
-  if(kernel_functions(deviceID, context, queue, num_elements, 20, 128, 128, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, NV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(kernel_functions(deviceID, context, queue, num_elements, 40, 64, 64, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, NV12, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(kernel_functions(deviceID, context, queue, num_elements, 5, 512, 512, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, YV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(kernel_functions(deviceID, context, queue, num_elements, 2, 1024, 1024, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, YV12, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-#else
-  return TEST_NOT_IMPLEMENTED;
-#endif
-
-  return result.Result();
-}
diff --git a/test_extensions/media_sharing/test_get_device_ids.cpp b/test_extensions/media_sharing/test_get_device_ids.cpp
deleted file mode 100644
index f8947ea63b..0000000000
--- a/test_extensions/media_sharing/test_get_device_ids.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "utils.h"
-
-int get_device_ids(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements,
-                   cl_dx9_media_adapter_type_khr adapterType)
-{
-  CResult result;
-
-  std::auto_ptr<CDeviceWrapper> deviceWrapper;
-  if (!DeviceCreate(adapterType, deviceWrapper))
-  {
-    result.ResultSub(CResult::TEST_ERROR);
-    return result.Result();
-  }
-
-  cl_uint devicesExpectedNum = 0;
-  cl_int error = clGetDeviceIDs(gPlatformIDdetected, CL_DEVICE_TYPE_ALL, 0, 0, &devicesExpectedNum);
-  if (error != CL_SUCCESS || devicesExpectedNum < 1)
-  {
-    log_error("clGetDeviceIDs failed: %s\n", IGetErrorString(error));
-    result.ResultSub(CResult::TEST_FAIL);
-    return result.Result();
-  }
-
-  std::vector<cl_device_id> devicesExpected(devicesExpectedNum);
-  error = clGetDeviceIDs(gPlatformIDdetected, CL_DEVICE_TYPE_ALL, devicesExpectedNum, &devicesExpected[0], 0);
-  if (error != CL_SUCCESS)
-  {
-    log_error("clGetDeviceIDs failed: %s\n", IGetErrorString(error));
-    result.ResultSub(CResult::TEST_FAIL);
-    return result.Result();
-  }
-
-  while (deviceWrapper->AdapterNext())
-  {
-    std::vector<cl_dx9_media_adapter_type_khr> mediaAdapterTypes;
-    mediaAdapterTypes.push_back(adapterType);
-
-    std::vector<void *> mediaDevices;
-    mediaDevices.push_back(deviceWrapper->Device());
-
-    //check if the test can be run on the adapter
-    if (CL_SUCCESS != (error = deviceExistForCLTest(gPlatformIDdetected, adapterType, deviceWrapper->Device(), result)))
-    {
-      return result.Result();
-    }
-
-    cl_uint devicesAllNum = 0;
-    error = clGetDeviceIDsFromDX9MediaAdapterKHR(gPlatformIDdetected, 1, &mediaAdapterTypes[0], &mediaDevices[0],
-      CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, 0, 0, &devicesAllNum);
-    if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND)
-    {
-      log_error("clGetDeviceIDsFromDX9MediaAdapterKHR failed: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    std::vector<cl_device_id> devicesAll;
-    if (devicesAllNum > 0)
-    {
-      devicesAll.resize(devicesAllNum);
-       error = clGetDeviceIDsFromDX9MediaAdapterKHR(gPlatformIDdetected, 1, &mediaAdapterTypes[0], &mediaDevices[0],
-        CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, devicesAllNum, &devicesAll[0], 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clGetDeviceIDsFromDX9MediaAdapterKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-    }
-
-    cl_uint devicesPreferredNum = 0;
-    error = clGetDeviceIDsFromDX9MediaAdapterKHR(gPlatformIDdetected, 1, &mediaAdapterTypes[0], &mediaDevices[0],
-      CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, 0, 0, &devicesPreferredNum);
-    if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND)
-    {
-      log_error("clGetDeviceIDsFromDX9MediaAdapterKHR failed: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    std::vector<cl_device_id> devicesPreferred;
-    if (devicesPreferredNum > 0)
-    {
-      devicesPreferred.resize(devicesPreferredNum);
-      error = clGetDeviceIDsFromDX9MediaAdapterKHR(gPlatformIDdetected, 1, &mediaAdapterTypes[0], &mediaDevices[0],
-        CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, devicesPreferredNum, &devicesPreferred[0], 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clGetDeviceIDsFromDX9MediaAdapterKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-    }
-
-    if (devicesAllNum < devicesPreferredNum)
-    {
-      log_error("Invalid number of preferred devices. It should be a subset of all devices\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    for (cl_uint i = 0; i < devicesPreferredNum; ++i)
-    {
-      cl_uint j = 0;
-      for (; j < devicesAllNum; ++j)
-      {
-        if (devicesPreferred[i] == devicesAll[j])
-          break;
-      }
-
-      if (j == devicesAllNum)
-      {
-        log_error("Preferred device is not a subset of all devices\n");
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-    }
-
-    for (cl_uint i = 0; i < devicesAllNum; ++i)
-    {
-      cl_uint j = 0;
-      for (; j < devicesExpectedNum; ++j)
-      {
-        if (devicesAll[i] == devicesExpected[j])
-          break;
-      }
-
-      if (j == devicesExpectedNum)
-      {
-        log_error("CL_ALL_DEVICES_FOR_MEDIA_ADAPTER_KHR should be a subset of all devices for selected platform\n");
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-    }
-  }
-
-  if (deviceWrapper->Status() != DEVICE_PASS)
-  {
-    std::string adapterName;
-    AdapterToString(adapterType, adapterName);
-    if (deviceWrapper->Status() == DEVICE_FAIL)
-  {
-      log_error("%s init failed\n", adapterName.c_str());
-    result.ResultSub(CResult::TEST_FAIL);
-    }
-    else
-    {
-      log_error("%s init incomplete due to unsupported device\n", adapterName.c_str());
-      result.ResultSub(CResult::TEST_NOTSUPPORTED);
-    }
-  }
-
-  return result.Result();
-}
-
-int test_get_device_ids(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-{
-  CResult result;
-
-#if defined(_WIN32)
-  if(get_device_ids(deviceID, context, queue, num_elements, CL_ADAPTER_D3D9_KHR) != 0)
-  {
-    log_error("\nTest case (D3D9) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(get_device_ids(deviceID, context, queue, num_elements, CL_ADAPTER_D3D9EX_KHR) != 0)
-  {
-    log_error("\nTest case (D3D9EX) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(get_device_ids(deviceID, context, queue, num_elements, CL_ADAPTER_DXVA_KHR) != 0)
-  {
-    log_error("\nTest case (DXVA) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-#else
-  return TEST_NOT_IMPLEMENTED;
-#endif
-
-  return result.Result();
-}
diff --git a/test_extensions/media_sharing/test_interop_sync.cpp b/test_extensions/media_sharing/test_interop_sync.cpp
deleted file mode 100644
index 6831a14da1..0000000000
--- a/test_extensions/media_sharing/test_interop_sync.cpp
+++ /dev/null
@@ -1,357 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "utils.h"
-
-int interop_user_sync(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-                   int num_elements, unsigned int width, unsigned int height,
-                   TContextFuncType functionCreate, cl_dx9_media_adapter_type_khr adapterType,
-                   TSurfaceFormat surfaceFormat, TSharedHandleType sharedHandle, cl_bool userSync)
-{
-  CResult result;
-
-  //create device
-  std::auto_ptr<CDeviceWrapper> deviceWrapper;
-  if (!DeviceCreate(adapterType, deviceWrapper))
-  {
-    result.ResultSub(CResult::TEST_ERROR);
-    return result.Result();
-  }
-
-  //generate input data
-  std::vector<cl_uchar> bufferIn(width * height * 3 / 2, 0);
-  if(!YUVGenerate(surfaceFormat, bufferIn, width, height, 0, 255))
-  {
-    result.ResultSub(CResult::TEST_ERROR);
-    return result.Result();
-  }
-
-  while (deviceWrapper->AdapterNext())
-  {
-    cl_int error;
-    //check if the test can be run on the adapter
-    if (CL_SUCCESS != (error = deviceExistForCLTest(gPlatformIDdetected, adapterType, deviceWrapper->Device(), result, sharedHandle)))
-    {
-      return result.Result();
-    }
-
-    if (surfaceFormat != SURFACE_FORMAT_NV12 &&
-      !SurfaceFormatCheck(adapterType, *deviceWrapper, surfaceFormat))
-    {
-      std::string sharedHandleStr = (sharedHandle == SHARED_HANDLE_ENABLED)? "yes": "no";
-      std::string syncStr = (userSync == CL_TRUE) ? "yes": "no";
-      std::string formatStr;
-      std::string adapterStr;
-      SurfaceFormatToString(surfaceFormat, formatStr);
-      AdapterToString(adapterType, adapterStr);
-      log_info("Skipping test case, image format is not supported by a device (adapter type: %s, format: %s, shared handle: %s, user sync: %s)\n",
-        adapterStr.c_str(), formatStr.c_str(), sharedHandleStr.c_str(), syncStr.c_str());
-      return result.Result();
-    }
-
-    void *objectSharedHandle = 0;
-    std::auto_ptr<CSurfaceWrapper> surface;
-    if (!MediaSurfaceCreate(adapterType, width, height, surfaceFormat, *deviceWrapper, surface,
-      (sharedHandle == SHARED_HANDLE_ENABLED) ? true: false, &objectSharedHandle))
-    {
-      log_error("Media surface creation failed for %i adapter\n", deviceWrapper->AdapterIdx());
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-    }
-
-    cl_context_properties contextProperties[] = {
-      CL_CONTEXT_PLATFORM, (cl_context_properties)gPlatformIDdetected,
-      AdapterTypeToContextInfo(adapterType), (cl_context_properties)deviceWrapper->Device(),
-      CL_CONTEXT_INTEROP_USER_SYNC, userSync,
-      0,
-    };
-
-
-    clContextWrapper ctx;
-    switch(functionCreate)
-    {
-    case CONTEXT_CREATE_DEFAULT:
-      ctx = clCreateContext(&contextProperties[0], 1, &gDeviceIDdetected, NULL, NULL, &error);
-      break;
-    case CONTEXT_CREATE_FROM_TYPE:
-      ctx = clCreateContextFromType(&contextProperties[0], gDeviceTypeSelected, NULL, NULL, &error);
-      break;
-    default:
-      log_error("Unknown context creation function enum\n");
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-      break;
-    }
-
-    if (error != CL_SUCCESS)
-    {
-      std::string functionName;
-      FunctionContextCreateToString(functionCreate, functionName);
-      log_error("%s failed: %s\n", functionName.c_str(), IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    if (!YUVSurfaceSet(surfaceFormat, surface, bufferIn, width, height))
-    {
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-    }
-
-#if defined(_WIN32)
-    cl_dx9_surface_info_khr surfaceInfo;
-    surfaceInfo.resource = *(static_cast<CD3D9SurfaceWrapper *>(surface.get()));
-    surfaceInfo.shared_handle = objectSharedHandle;
-#else
-    void *surfaceInfo = 0;
-    return TEST_NOT_IMPLEMENTED;
-#endif
-
-    std::vector<cl_mem> memObjList;
-    unsigned int planesNum = PlanesNum(surfaceFormat);
-    std::vector<clMemWrapper> planesList(planesNum);
-    for (unsigned int planeIdx = 0; planeIdx < planesNum; ++planeIdx)
-    {
-      planesList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(ctx, CL_MEM_READ_WRITE, adapterType, &surfaceInfo, planeIdx, &error);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clCreateFromDX9MediaSurfaceKHR failed for plane %i: %s\n", planeIdx, IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-      memObjList.push_back(planesList[planeIdx]);
-    }
-
-    clCommandQueueWrapper cmdQueue = clCreateCommandQueueWithProperties(ctx, gDeviceIDdetected, 0, &error );
-    if (error != CL_SUCCESS)
-    {
-      log_error("Unable to create command queue: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    if (!ImageInfoVerify(adapterType, memObjList, width, height, surface, objectSharedHandle))
-    {
-      log_error("Image info verification failed\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    if (userSync == CL_TRUE)
-    {
- #if defined(_WIN32)
-      IDirect3DQuery9* eventQuery = NULL;
-      switch (adapterType)
-      {
-      case CL_ADAPTER_D3D9_KHR:
-        {
-          LPDIRECT3DDEVICE9 device = (LPDIRECT3DDEVICE9)deviceWrapper->Device();
-          device->CreateQuery(D3DQUERYTYPE_EVENT, &eventQuery);
-          eventQuery->Issue(D3DISSUE_END);
-
-          while (S_FALSE == eventQuery->GetData(NULL, 0, D3DGETDATA_FLUSH))
-            ;
-        }
-        break;
-      case CL_ADAPTER_D3D9EX_KHR:
-        {
-          LPDIRECT3DDEVICE9EX device = (LPDIRECT3DDEVICE9EX)deviceWrapper->Device();
-          device->CreateQuery(D3DQUERYTYPE_EVENT, &eventQuery);
-          eventQuery->Issue(D3DISSUE_END);
-
-          while (S_FALSE == eventQuery->GetData(NULL, 0, D3DGETDATA_FLUSH))
-            ;
-        }
-        break;
-      case CL_ADAPTER_DXVA_KHR:
-        {
-          CDXVAWrapper *DXVADevice = dynamic_cast<CDXVAWrapper *>(&(*deviceWrapper));
-          LPDIRECT3DDEVICE9EX device = (LPDIRECT3DDEVICE9EX)(DXVADevice->D3D9()).Device();
-          device->CreateQuery(D3DQUERYTYPE_EVENT, &eventQuery);
-          eventQuery->Issue(D3DISSUE_END);
-
-          while (S_FALSE == eventQuery->GetData(NULL, 0, D3DGETDATA_FLUSH))
-            ;
-        }
-        break;
-      default:
-        log_error("Unknown adapter type\n");
-        return false;
-        break;
-      }
-      if(eventQuery)
-      {
-          eventQuery->Release();
-      }
-#else
-      return TEST_NOT_IMPLEMENTED;
-#endif
-    }
-
-    error = clEnqueueAcquireDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjList.size()), &memObjList.at(0), 0, 0, 0);
-    if (error != CL_SUCCESS)
-    {
-      log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    size_t origin[3] = {0,0,0};
-    size_t offset = 0;
-    size_t frameSize = width * height * 3 / 2;
-    std::vector<cl_uchar> out( frameSize, 0 );
-    for (size_t i = 0; i < memObjList.size(); ++i)
-    {
-      size_t planeWidth = (i == 0) ? width: width / 2;
-      size_t planeHeight = (i == 0) ? height: height / 2;
-      size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-
-      error = clEnqueueReadImage(cmdQueue, memObjList.at(i), CL_TRUE, origin, regionPlane, 0, 0, &out.at(offset), 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueReadImage failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-
-      offset += planeWidth * planeHeight;
-    }
-
-    if (!YUVCompare(surfaceFormat, out, bufferIn, width, height))
-    {
-      log_error("OCL object verification failed - clEnqueueReadImage\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    error = clEnqueueReleaseDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjList.size()), &memObjList.at(0), 0, 0, 0);
-    if (error != CL_SUCCESS)
-    {
-      log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    if (userSync == CL_TRUE)
-    {
-      error = clFinish(cmdQueue);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clFinish failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-    }
-
-    //shared object verification
-    std::vector<cl_uchar> bufferOut(frameSize, 0);
-    if (!YUVSurfaceGet(surfaceFormat, surface, bufferOut, width, height))
-    {
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    if (!YUVCompare(surfaceFormat, bufferOut, bufferIn, width, height))
-    {
-      log_error("Media surface is different than expected\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-  }
-
-  if (deviceWrapper->Status() != DEVICE_PASS)
-  {
-    std::string adapterName;
-    AdapterToString(adapterType, adapterName);
-
-    if (deviceWrapper->Status() == DEVICE_FAIL)
-    {
-    log_error("%s init failed\n", adapterName.c_str());
-    result.ResultSub(CResult::TEST_FAIL);
-    }
-    else
-    {
-      log_error("%s init incomplete due to unsupported device\n", adapterName.c_str());
-      result.ResultSub(CResult::TEST_NOTSUPPORTED);
-    }
-  }
-
-  return result.Result();
-}
-
-int test_interop_user_sync(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-{
-  const unsigned int WIDTH = 256;
-  const unsigned int HEIGHT = 256;
-
-  std::vector<cl_dx9_media_adapter_type_khr> adapters;
-#if defined(_WIN32)
-  adapters.push_back(CL_ADAPTER_D3D9_KHR);
-  adapters.push_back(CL_ADAPTER_D3D9EX_KHR);
-  adapters.push_back(CL_ADAPTER_DXVA_KHR);
-#else
-  return TEST_NOT_IMPLEMENTED;
-#endif
-
-  std::vector<TContextFuncType> contextFuncs;
-  contextFuncs.push_back(CONTEXT_CREATE_DEFAULT);
-  contextFuncs.push_back(CONTEXT_CREATE_FROM_TYPE);
-
-  std::vector<TSurfaceFormat> formats;
-  formats.push_back(SURFACE_FORMAT_NV12);
-  formats.push_back(SURFACE_FORMAT_YV12);
-
-  std::vector<TSharedHandleType> sharedHandleTypes;
-  sharedHandleTypes.push_back(SHARED_HANDLE_DISABLED);
-  sharedHandleTypes.push_back(SHARED_HANDLE_ENABLED);
-
-  std::vector<cl_bool> sync;
-  sync.push_back(CL_FALSE);
-  sync.push_back(CL_TRUE);
-
-  CResult result;
-  for (size_t adapterIdx = 0; adapterIdx < adapters.size(); ++adapterIdx)
-  {
-    //iteration through all create context functions
-    for (size_t contextFuncIdx = 0; contextFuncIdx < contextFuncs.size(); ++contextFuncIdx)
-    {
-      //iteration through YUV formats
-      for (size_t formatIdx = 0; formatIdx < formats.size(); ++formatIdx)
-      {
-        //shared handle enabled or disabled
-        for (size_t sharedHandleIdx = 0; sharedHandleIdx < sharedHandleTypes.size(); ++sharedHandleIdx)
-        {
-          //user sync interop disabled or enabled
-          for (size_t syncIdx = 0; syncIdx < sync.size(); ++syncIdx)
-          {
-            if (adapters[adapterIdx] == CL_ADAPTER_D3D9_KHR && sharedHandleTypes[sharedHandleIdx] == SHARED_HANDLE_ENABLED)
-              continue;
-
-            if(interop_user_sync(deviceID, context, queue, num_elements, WIDTH, HEIGHT,
-              contextFuncs[contextFuncIdx], adapters[adapterIdx], formats[formatIdx],
-              sharedHandleTypes[sharedHandleIdx], sync[syncIdx]) != 0)
-            {
-              std::string syncStr = (sync[syncIdx] == CL_TRUE) ? "user sync enabled": "user sync disabled";
-              std::string sharedHandle = (sharedHandleTypes[sharedHandleIdx] == SHARED_HANDLE_ENABLED)? "shared handle": "no shared handle";
-              std::string adapterStr;
-              std::string formatStr;
-              SurfaceFormatToString(formats[formatIdx], formatStr);
-              AdapterToString(adapters[adapterIdx], adapterStr);
-
-              log_error("\nTest case - clCreateContext (%s, %s, %s, %s) failed\n\n", adapterStr.c_str(), formatStr.c_str(), sharedHandle.c_str(), syncStr.c_str());
-              result.ResultSub(CResult::TEST_FAIL);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  return result.Result();
-}
diff --git a/test_extensions/media_sharing/test_memory_access.cpp b/test_extensions/media_sharing/test_memory_access.cpp
deleted file mode 100644
index 5aabaf6f0b..0000000000
--- a/test_extensions/media_sharing/test_memory_access.cpp
+++ /dev/null
@@ -1,468 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "utils.h"
-
-int memory_access(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements,
-                  unsigned int width, unsigned int height, cl_dx9_media_adapter_type_khr adapterType,
-                  TSurfaceFormat surfaceFormat, TSharedHandleType sharedHandle)
-{
-  CResult result;
-
-  std::auto_ptr<CDeviceWrapper> deviceWrapper;
-  //creates device
-  if (!DeviceCreate(adapterType, deviceWrapper))
-  {
-    result.ResultSub(CResult::TEST_ERROR);
-    return result.Result();
-  }
-
-  //generate input and expected data
-  size_t frameSize = width * height * 3 / 2;
-  std::vector<cl_uchar> bufferRef0(frameSize, 0);
-  std::vector<cl_uchar> bufferRef1(frameSize, 0);
-  std::vector<cl_uchar> bufferRef2(frameSize, 0);
-  if (!YUVGenerate(surfaceFormat, bufferRef0, width, height, 0, 90) ||
-    !YUVGenerate(surfaceFormat, bufferRef1, width, height, 91, 180) ||
-    !YUVGenerate(surfaceFormat, bufferRef2, width, height, 181, 255))
-  {
-    result.ResultSub(CResult::TEST_ERROR);
-    return result.Result();
-  }
-
-  //iterates through all devices
-  while (deviceWrapper->AdapterNext())
-  {
-    cl_int error;
-    //check if the test can be run on the adapter
-    if (CL_SUCCESS != (error = deviceExistForCLTest(gPlatformIDdetected, adapterType, deviceWrapper->Device(), result, sharedHandle)))
-    {
-      return result.Result();
-    }
-
-    if (surfaceFormat != SURFACE_FORMAT_NV12 && !SurfaceFormatCheck(adapterType, *deviceWrapper, surfaceFormat))
-    {
-      std::string sharedHandleStr = (sharedHandle == SHARED_HANDLE_ENABLED)? "yes": "no";
-      std::string formatStr;
-      std::string adapterStr;
-      SurfaceFormatToString(surfaceFormat, formatStr);
-      AdapterToString(adapterType, adapterStr);
-      log_info("Skipping test case, image format is not supported by a device (adapter type: %s, format: %s, shared handle: %s)\n",
-        adapterStr.c_str(), formatStr.c_str(), sharedHandleStr.c_str());
-      return result.Result();
-    }
-
-    void *objectSharedHandle = 0;
-    std::auto_ptr<CSurfaceWrapper> surface;
-
-    //creates surface
-    if (!MediaSurfaceCreate(adapterType, width, height, surfaceFormat, *deviceWrapper, surface,
-      (sharedHandle == SHARED_HANDLE_ENABLED) ? true: false, &objectSharedHandle))
-    {
-      log_error("Media surface creation failed for %i adapter\n", deviceWrapper->AdapterIdx());
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-    }
-
-    if (!YUVSurfaceSet(surfaceFormat, surface, bufferRef0, width, height))
-    {
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-    }
-
-    cl_context_properties contextProperties[] = {
-      CL_CONTEXT_PLATFORM, (cl_context_properties)gPlatformIDdetected,
-      AdapterTypeToContextInfo(adapterType), (cl_context_properties)deviceWrapper->Device(),
-      0,
-    };
-
-    clContextWrapper ctx = clCreateContext(&contextProperties[0], 1, &gDeviceIDdetected, NULL, NULL, &error);
-    if (error != CL_SUCCESS)
-    {
-      log_error("clCreateContext failed: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    clCommandQueueWrapper cmdQueue = clCreateCommandQueueWithProperties(ctx, gDeviceIDdetected, 0, &error );
-    if (error != CL_SUCCESS)
-    {
-      log_error("Unable to create command queue: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    { //memory access write
-#if defined(_WIN32)
-      cl_dx9_surface_info_khr surfaceInfo;
-      surfaceInfo.resource = *(static_cast<CD3D9SurfaceWrapper *>(surface.get()));
-      surfaceInfo.shared_handle = objectSharedHandle;
-#else
-      void *surfaceInfo = 0;
-      return TEST_NOT_IMPLEMENTED;
-#endif
-
-      std::vector<cl_mem> memObjList;
-      unsigned int planesNum = PlanesNum(surfaceFormat);
-      std::vector<clMemWrapper> planesList(planesNum);
-      for (unsigned int planeIdx = 0; planeIdx < planesNum; ++planeIdx)
-      {
-        planesList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(ctx, CL_MEM_WRITE_ONLY, adapterType, &surfaceInfo, planeIdx, &error);
-        if (error != CL_SUCCESS)
-        {
-          log_error("clCreateFromDX9MediaSurfaceKHR failed for WRITE_ONLY plane %i: %s\n", planeIdx, IGetErrorString(error));
-          result.ResultSub(CResult::TEST_FAIL);
-          return result.Result();
-        }
-        memObjList.push_back(planesList[planeIdx]);
-      }
-
-      error = clEnqueueAcquireDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjList.size()), &memObjList[0], 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-
-      size_t offset = 0;
-      size_t origin[3] = {0,0,0};
-      for (size_t i = 0; i < memObjList.size(); ++i)
-      {
-        size_t planeWidth = (i == 0) ? width: width / 2;
-        size_t planeHeight = (i == 0) ? height: height / 2;
-        size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-
-        error = clEnqueueWriteImage(cmdQueue, memObjList[i], CL_TRUE, origin, regionPlane,
-          0, 0, &bufferRef1[offset], 0, 0, 0);
-        if (error != CL_SUCCESS)
-        {
-          log_error("clEnqueueWriteImage failed: %s\n", IGetErrorString(error));
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        offset += planeWidth * planeHeight;
-      }
-
-      error = clEnqueueReleaseDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjList.size()), &memObjList[0], 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-    }
-
-    std::vector<cl_uchar> bufferOut0(frameSize, 0);
-    if (!YUVSurfaceGet(surfaceFormat, surface, bufferOut0, width, height))
-    {
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    if (!YUVCompare(surfaceFormat, bufferOut0, bufferRef1, width, height))
-    {
-      log_error("Media surface is different than expected\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    { //memory access read
-#if defined(_WIN32)
-      cl_dx9_surface_info_khr surfaceInfo;
-      surfaceInfo.resource = *(static_cast<CD3D9SurfaceWrapper *>(surface.get()));
-      surfaceInfo.shared_handle = objectSharedHandle;
-#else
-      void *surfaceInfo = 0;
-      return TEST_NOT_IMPLEMENTED;
-#endif
-
-      std::vector<cl_mem> memObjList;
-      unsigned int planesNum = PlanesNum(surfaceFormat);
-      std::vector<clMemWrapper> planesList(planesNum);
-      for (unsigned int planeIdx = 0; planeIdx < planesNum; ++planeIdx)
-      {
-        planesList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(ctx, CL_MEM_READ_ONLY, adapterType, &surfaceInfo, planeIdx, &error);
-        if (error != CL_SUCCESS)
-        {
-          log_error("clCreateFromDX9MediaSurfaceKHR failed for READ_ONLY plane %i: %s\n", planeIdx, IGetErrorString(error));
-          result.ResultSub(CResult::TEST_FAIL);
-          return result.Result();
-        }
-        memObjList.push_back(planesList[planeIdx]);
-      }
-
-      error = clEnqueueAcquireDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjList.size()), &memObjList[0], 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-
-      std::vector<cl_uchar> out( frameSize, 0 );
-      size_t offset = 0;
-      size_t origin[3] = {0,0,0};
-
-      for (size_t i = 0; i < memObjList.size(); ++i)
-      {
-        size_t planeWidth = (i == 0) ? width: width / 2;
-        size_t planeHeight = (i == 0) ? height: height / 2;
-        size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-
-        error = clEnqueueReadImage(cmdQueue, memObjList[i], CL_TRUE, origin, regionPlane, 0, 0,
-          &out[offset], 0, 0, 0);
-        if (error != CL_SUCCESS)
-        {
-          log_error("clEnqueueReadImage failed: %s\n", IGetErrorString(error));
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        offset += planeWidth * planeHeight;
-      }
-
-      if (!YUVCompare(surfaceFormat, out, bufferRef1, width, height))
-      {
-        log_error("OCL image (READ_ONLY) is different then expected\n");
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-
-      error = clEnqueueReleaseDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjList.size()), &memObjList[0], 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-    }
-
-    std::vector<cl_uchar> bufferOut1(frameSize, 0);
-    if (!YUVSurfaceGet(surfaceFormat, surface, bufferOut1, width, height))
-    {
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    if (!YUVCompare(surfaceFormat, bufferOut1, bufferRef1, width, height))
-    {
-      log_error("Media surface is different than expected\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    { //memory access read write
-#if defined(_WIN32)
-      cl_dx9_surface_info_khr surfaceInfo;
-      surfaceInfo.resource = *(static_cast<CD3D9SurfaceWrapper *>(surface.get()));
-      surfaceInfo.shared_handle = objectSharedHandle;
-#else
-      void *surfaceInfo = 0;
-      return TEST_NOT_IMPLEMENTED;
-#endif
-
-      std::vector<cl_mem> memObjList;
-      unsigned int planesNum = PlanesNum(surfaceFormat);
-      std::vector<clMemWrapper> planesList(planesNum);
-      for (unsigned int planeIdx = 0; planeIdx < planesNum; ++planeIdx)
-      {
-        planesList[planeIdx] = clCreateFromDX9MediaSurfaceKHR(ctx, CL_MEM_READ_WRITE, adapterType, &surfaceInfo, planeIdx, &error);
-        if (error != CL_SUCCESS)
-        {
-          log_error("clCreateFromDX9MediaSurfaceKHR failed for READ_WRITE plane %i: %s\n", planeIdx, IGetErrorString(error));
-          result.ResultSub(CResult::TEST_FAIL);
-          return result.Result();
-        }
-        memObjList.push_back(planesList[planeIdx]);
-      }
-
-      error = clEnqueueAcquireDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjList.size()), &memObjList[0], 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueAcquireDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-
-      { //read
-        std::vector<cl_uchar> out( frameSize, 0 );
-        size_t offset = 0;
-        size_t origin[3] = {0,0,0};
-
-        for (size_t i = 0; i < memObjList.size(); ++i)
-        {
-          size_t planeWidth = (i == 0) ? width: width / 2;
-          size_t planeHeight = (i == 0) ? height: height / 2;
-          size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-
-          error = clEnqueueReadImage(cmdQueue, memObjList[i], CL_TRUE, origin, regionPlane, 0, 0,
-            &out[offset], 0, 0, 0);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueReadImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          offset += planeWidth * planeHeight;
-        }
-
-        if (!YUVCompare(surfaceFormat, out, bufferRef1, width, height))
-        {
-          log_error("OCL image (READ_WRITE) is different then expected\n");
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-      }
-
-      { //write
-        size_t offset = 0;
-        size_t origin[3] = {0,0,0};
-        for (size_t i = 0; i < memObjList.size(); ++i)
-        {
-          size_t planeWidth = (i == 0) ? width: width / 2;
-          size_t planeHeight = (i == 0) ? height: height / 2;
-          size_t regionPlane[3] = {planeWidth, planeHeight, 1};
-
-          error = clEnqueueWriteImage(cmdQueue, memObjList[i], CL_TRUE, origin, regionPlane,
-            0, 0, &bufferRef2[offset], 0, 0, 0);
-          if (error != CL_SUCCESS)
-          {
-            log_error("clEnqueueWriteImage failed: %s\n", IGetErrorString(error));
-            result.ResultSub(CResult::TEST_FAIL);
-          }
-
-          offset += planeWidth * planeHeight;
-        }
-      }
-
-      error = clEnqueueReleaseDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjList.size()), &memObjList[0], 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueReleaseDX9MediaSurfacesKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-    }
-
-    std::vector<cl_uchar> bufferOut2(frameSize, 0);
-    if (!YUVSurfaceGet(surfaceFormat, surface, bufferOut2, width, height))
-    {
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    if (!YUVCompare(surfaceFormat, bufferOut2, bufferRef2, width, height))
-    {
-      log_error("Media surface is different than expected\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-  }
-
-  if (deviceWrapper->Status() != DEVICE_PASS)
-  {
-    std::string adapterName;
-    AdapterToString(adapterType, adapterName);
-    if (deviceWrapper->Status() == DEVICE_FAIL)
-    {
-    log_error("%s init failed\n", adapterName.c_str());
-    result.ResultSub(CResult::TEST_FAIL);
-    }
-    else
-    {
-      log_error("%s init incomplete due to unsupported device\n", adapterName.c_str());
-      result.ResultSub(CResult::TEST_NOTSUPPORTED);
-    }
-  }
-
-  return result.Result();
-}
-
-int test_memory_access(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-{
-  CResult result;
-
-#if defined(_WIN32)
-  //D3D9
-  if(memory_access(deviceID, context, queue, num_elements, 256, 256, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, NV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(memory_access(deviceID, context, queue, num_elements, 512, 256, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, YV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  //D3D9EX
-  if(memory_access(deviceID, context, queue, num_elements, 256, 512, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, NV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(memory_access(deviceID, context, queue, num_elements, 512, 256, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, NV12, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(memory_access(deviceID, context, queue, num_elements, 256, 256, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, YV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(memory_access(deviceID, context, queue, num_elements, 128, 128, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, YV12, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  //DXVA
-  if(memory_access(deviceID, context, queue, num_elements, 128, 128, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, NV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(memory_access(deviceID, context, queue, num_elements, 64, 64, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_NV12, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, NV12, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(memory_access(deviceID, context, queue, num_elements, 512, 512, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, YV12, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(memory_access(deviceID, context, queue, num_elements, 1024, 1024, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_YV12, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, YV12, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-#else
-  return TEST_NOT_IMPLEMENTED;
-#endif
-
-  return result.Result();
-}
diff --git a/test_extensions/media_sharing/test_other_data_types.cpp b/test_extensions/media_sharing/test_other_data_types.cpp
deleted file mode 100644
index 8a73866e1e..0000000000
--- a/test_extensions/media_sharing/test_other_data_types.cpp
+++ /dev/null
@@ -1,1023 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "harness/errorHelpers.h"
-#include "harness/imageHelpers.h"
-#include "harness/kernelHelpers.h"
-
-#include "utils.h"
-
-template<typename T>
-int other_data_types(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements,
-                    unsigned int iterationNum, unsigned int width, unsigned int height, cl_dx9_media_adapter_type_khr adapterType,
-                    TSurfaceFormat surfaceFormat, TSharedHandleType sharedHandle)
-{
-  const unsigned int FRAME_NUM = 2;
-  const float MAX_VALUE = 0.6f;
-  const std::string PROGRAM_STR =
-    "__kernel void TestFunction( read_only image2d_t imageIn, write_only image2d_t imageOut, "
-    NL "                            sampler_t sampler, __global int *imageRes)"
-    NL "{"
-    NL "  int w = get_global_id(0);"
-    NL "  int h = get_global_id(1);"
-    NL "  int width = get_image_width(imageIn);"
-    NL "  int height = get_image_height(imageOut);"
-    NL "  float4 color0 = read_imagef(imageIn, sampler, (int2)(w,h)) - 0.2f;"
-    NL "  float4 color1 = read_imagef(imageIn, sampler, (float2)(w,h)) - 0.2f;"
-    NL "  color0 = (color0 == color1) ? color0: (float4)(0.5, 0.5, 0.5, 0.5);"
-    NL "  write_imagef(imageOut, (int2)(w,h), color0);"
-    NL "  if(w == 0 && h == 0)"
-    NL "  {"
-    NL "    imageRes[0] = width;"
-    NL "    imageRes[1] = height;"
-    NL "  }"
-    NL "}";
-
-  CResult result;
-
-  cl_image_format format;
-  if(!SurfaceFormatToOCL(surfaceFormat, format))
-  {
-    result.ResultSub(CResult::TEST_ERROR);
-    return result.Result();
-  }
-
-  std::auto_ptr<CDeviceWrapper> deviceWrapper;
-  if (!DeviceCreate(adapterType, deviceWrapper))
-  {
-    result.ResultSub(CResult::TEST_ERROR);
-    return result.Result();
-  }
-
-  while (deviceWrapper->AdapterNext())
-  {
-    cl_int error;
-    //check if the test can be run on the adapter
-    if (CL_SUCCESS != (error = deviceExistForCLTest(gPlatformIDdetected, adapterType, deviceWrapper->Device(), result, sharedHandle)))
-    {
-      return result.Result();
-    }
-
-    cl_context_properties contextProperties[] = {
-      CL_CONTEXT_PLATFORM, (cl_context_properties)gPlatformIDdetected,
-      AdapterTypeToContextInfo(adapterType), (cl_context_properties)deviceWrapper->Device(),
-      0,
-    };
-
-    clContextWrapper ctx = clCreateContext(&contextProperties[0], 1, &gDeviceIDdetected, NULL, NULL, &error);
-    if (error != CL_SUCCESS)
-    {
-      log_error("clCreateContext failed: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    clCommandQueueWrapper cmdQueue = clCreateCommandQueueWithProperties(ctx, gDeviceIDdetected, 0, &error );
-    if (error != CL_SUCCESS)
-    {
-      log_error("Unable to create command queue: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    if (!SurfaceFormatCheck(adapterType, *deviceWrapper, surfaceFormat))
-    {
-      std::string sharedHandleStr = (sharedHandle == SHARED_HANDLE_ENABLED)? "yes": "no";
-      std::string formatStr;
-      std::string adapterStr;
-      SurfaceFormatToString(surfaceFormat, formatStr);
-      AdapterToString(adapterType, adapterStr);
-      log_info("Skipping test case, image format is not supported by a device (adapter type: %s, format: %s, shared handle: %s)\n",
-        adapterStr.c_str(), formatStr.c_str(), sharedHandleStr.c_str());
-      return result.Result();
-    }
-
-    if(!ImageFormatCheck(ctx, CL_MEM_OBJECT_IMAGE2D, format))
-    {
-      std::string sharedHandleStr = (sharedHandle == SHARED_HANDLE_ENABLED)? "yes": "no";
-      std::string formatStr;
-      std::string adapterStr;
-      SurfaceFormatToString(surfaceFormat, formatStr);
-      AdapterToString(adapterType, adapterStr);
-      log_info("Skipping test case, image format is not supported by OCL (adapter type: %s, format: %s, shared handle: %s)\n",
-        adapterStr.c_str(), formatStr.c_str(), sharedHandleStr.c_str());
-      return result.Result();
-    }
-
-    if (format.image_channel_data_type == CL_HALF_FLOAT)
-    {
-      if (DetectFloatToHalfRoundingMode(cmdQueue))
-      {
-        log_error("Unable to detect rounding mode\n");
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-    }
-
-    std::vector<std::vector<T> > bufferIn(FRAME_NUM);
-    std::vector<std::vector<T> > bufferExp(FRAME_NUM);
-    float step = MAX_VALUE / static_cast<float>(FRAME_NUM);
-    unsigned int planeNum = ChannelNum(surfaceFormat);
-    for (size_t i = 0; i < FRAME_NUM; ++i)
-    {
-      DataGenerate(surfaceFormat, format.image_channel_data_type, bufferIn[i], width, height, planeNum, step * i, step * (i + 1));
-      DataGenerate(surfaceFormat, format.image_channel_data_type, bufferExp[i], width, height, planeNum, step * i, step * (i + 1), 0.2f);
-    }
-
-    void *objectSrcHandle = 0;
-    std::auto_ptr<CSurfaceWrapper> surfaceSrc;
-    if (!MediaSurfaceCreate(adapterType, width, height, surfaceFormat, *deviceWrapper, surfaceSrc,
-      (sharedHandle == SHARED_HANDLE_ENABLED) ? true: false, &objectSrcHandle))
-    {
-      log_error("Media surface creation failed for %i adapter\n", deviceWrapper->AdapterIdx());
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-    }
-
-    void *objectDstHandle = 0;
-    std::auto_ptr<CSurfaceWrapper> surfaceDst;
-    if (!MediaSurfaceCreate(adapterType, width, height, surfaceFormat, *deviceWrapper, surfaceDst,
-      (sharedHandle == SHARED_HANDLE_ENABLED) ? true: false, &objectDstHandle))
-    {
-      log_error("Media surface creation failed for %i adapter\n", deviceWrapper->AdapterIdx());
-      result.ResultSub(CResult::TEST_ERROR);
-      return result.Result();
-    }
-
-#if defined(_WIN32)
-    cl_dx9_surface_info_khr surfaceSrcInfo;
-    CD3D9SurfaceWrapper *dx9SurfaceSrc = (static_cast<CD3D9SurfaceWrapper *>(surfaceSrc.get()));
-    surfaceSrcInfo.resource = *dx9SurfaceSrc;
-    surfaceSrcInfo.shared_handle = objectSrcHandle;
-
-    cl_dx9_surface_info_khr surfaceDstInfo;
-    CD3D9SurfaceWrapper *dx9SurfaceDst = (static_cast<CD3D9SurfaceWrapper *>(surfaceDst.get()));
-    surfaceDstInfo.resource = *dx9SurfaceDst;
-    surfaceDstInfo.shared_handle = objectDstHandle;
-#else
-    void *surfaceSrcInfo = 0;
-    void *surfaceDstInfo = 0;
-    return TEST_NOT_IMPLEMENTED;
-#endif
-
-    //create OCL shared object
-    clMemWrapper objectSrcShared = clCreateFromDX9MediaSurfaceKHR(ctx, CL_MEM_READ_WRITE, adapterType, &surfaceSrcInfo, 0, &error);
-    if (error != CL_SUCCESS)
-    {
-      log_error("clCreateFromDX9MediaSurfaceKHR failed: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    clMemWrapper objectDstShared = clCreateFromDX9MediaSurfaceKHR(ctx, CL_MEM_READ_WRITE, adapterType, &surfaceDstInfo, 0, &error);
-    if (error != CL_SUCCESS)
-    {
-      log_error("clCreateFromDX9MediaSurfaceKHR failed: %s\n", IGetErrorString(error));
-      result.ResultSub(CResult::TEST_FAIL);
-      return result.Result();
-    }
-
-    std::vector<cl_mem> memObjList;
-    memObjList.push_back(objectSrcShared);
-    memObjList.push_back(objectDstShared);
-
-    if (!GetMemObjInfo(objectSrcShared, adapterType, surfaceSrc, objectSrcHandle))
-    {
-      log_error("Invalid memory object info\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    if (!GetImageInfo(objectSrcShared, format, sizeof(T) * planeNum,
-      width * sizeof(T) * planeNum,  0, width, height, 0, 0))
-    {
-      log_error("clGetImageInfo failed\n");
-      result.ResultSub(CResult::TEST_FAIL);
-    }
-
-    for (size_t frameIdx = 0; frameIdx < iterationNum; ++frameIdx)
-    {
-      //surface set
-#if defined(_WIN32)
-      D3DLOCKED_RECT rect;
-      if (FAILED((*dx9SurfaceSrc)->LockRect(&rect, NULL, 0)))
-      {
-        log_error("Surface lock failed\n");
-        result.ResultSub(CResult::TEST_ERROR);
-        return result.Result();
-      }
-
-      size_t pitch = rect.Pitch / sizeof(T);
-      size_t lineSize = width * planeNum * sizeof(T);
-      T *ptr = static_cast<T *>(rect.pBits);
-
-      for (size_t y = 0; y < height; ++y)
-        memcpy(ptr + y * pitch, &bufferIn[frameIdx % FRAME_NUM][y * width * planeNum], lineSize);
-
-      (*dx9SurfaceSrc)->UnlockRect();
-#else
-      void *surfaceInfo = 0;
-      return TEST_NOT_IMPLEMENTED;
-#endif
-
-      error = clEnqueueAcquireDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjList.size()), &memObjList[0], 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueAcquireMediaSurfaceKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-        return result.Result();
-      }
-
-      size_t origin[3] = {0,0,0};
-      size_t region[3] = {width, height, 1};
-
-      { //read operation
-        std::vector<T> out( planeNum * width * height, 0 );
-        error = clEnqueueReadImage(cmdQueue, objectSrcShared, CL_TRUE, origin, region, 0, 0, &out[0], 0, 0, 0);
-        if (error != CL_SUCCESS)
-        {
-          log_error("clEnqueueReadImage failed: %s\n", IGetErrorString(error));
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        if (!DataCompare(surfaceFormat, format.image_channel_data_type, out, bufferIn[frameIdx % FRAME_NUM], width, height, planeNum))
-        {
-          log_error("Frame idx: %i, OCL object is different then expected\n", frameIdx);
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-      }
-
-      { //write operation
-        error = clEnqueueWriteImage(cmdQueue, objectSrcShared, CL_TRUE, origin, region,
-          0, 0, &bufferExp[frameIdx % FRAME_NUM][0], 0, 0, 0);
-        if (error != CL_SUCCESS)
-        {
-          log_error("clEnqueueWriteImage failed: %s\n", IGetErrorString(error));
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-      }
-
-      { //kernel operations
-        clSamplerWrapper sampler = clCreateSampler( ctx, CL_FALSE, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error );
-        if(error != CL_SUCCESS)
-        {
-          log_error("Unable to create sampler\n");
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        size_t threads[ 2 ] = { width, height };
-        clProgramWrapper program;
-        clKernelWrapper kernel;
-        const char *progPtr = PROGRAM_STR.c_str();
-        if(create_single_kernel_helper(ctx, &program, &kernel, 1, (const char **)&progPtr, "TestFunction"))
-          result.ResultSub(CResult::TEST_FAIL);
-
-        error = clSetKernelArg( kernel, 0, sizeof( objectSrcShared ), &(objectSrcShared) );
-        if (error != CL_SUCCESS)
-        {
-          log_error("Unable to set kernel arguments" );
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        error = clSetKernelArg( kernel, 1, sizeof( objectDstShared ), &(objectDstShared) );
-        if (error != CL_SUCCESS)
-        {
-          log_error("Unable to set kernel arguments" );
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        error = clSetKernelArg( kernel, 2, sizeof( sampler ), &sampler );
-        if (error != CL_SUCCESS)
-        {
-          log_error("Unable to set kernel arguments" );
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        size_t bufferSize = sizeof(cl_int) * 2;
-        clMemWrapper imageRes = clCreateBuffer( ctx, CL_MEM_READ_WRITE, bufferSize, NULL, &error);
-        if (error != CL_SUCCESS)
-        {
-          log_error("clCreateBuffer failed: %s\n", IGetErrorString(error));
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        error = clSetKernelArg( kernel, 3, sizeof( imageRes ), &imageRes );
-
-        size_t localThreads[ 2 ];
-        error = get_max_common_2D_work_group_size( ctx, kernel, threads, localThreads );
-        if (error != CL_SUCCESS)
-        {
-          log_error("Unable to get work group size to use" );
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        error = clEnqueueNDRangeKernel( cmdQueue, kernel, 2, NULL, threads, localThreads, 0, NULL, NULL );
-        if (error != CL_SUCCESS)
-        {
-          log_error("Unable to execute test kernel" );
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        std::vector<cl_uint> imageResOut(2, 0);
-        error = clEnqueueReadBuffer( cmdQueue, imageRes, CL_TRUE, 0, bufferSize, &imageResOut[0], 0, NULL, NULL );
-        if (error != CL_SUCCESS)
-        {
-          log_error("Unable to read buffer");
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        if(imageResOut[0] != width)
-        {
-          log_error("Invalid width value, test = %i, expected = %i\n", imageResOut[0], width);
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        if(imageResOut[1] != height)
-        {
-          log_error("Invalid height value, test = %i, expected = %i\n", imageResOut[1], height);
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-      }
-
-      { //map operation
-        size_t mapOrigin[3] = {0,0,0};
-        size_t mapRegion[3] = {width, height, 1};
-
-        std::vector<T> out( width * height * planeNum, 0 );
-        size_t rowPitch = 0;
-        size_t slicePitch = 0;
-        void *mapPtr = clEnqueueMapImage(cmdQueue, objectDstShared, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, mapOrigin, mapRegion,
-          &rowPitch, &slicePitch, 0, 0, 0, &error);
-        if (error != CL_SUCCESS)
-        {
-          log_error("clEnqueueMapImage failed: %s\n", IGetErrorString(error));
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        for (size_t y = 0; y < height; ++y)
-          memcpy(&out[y * width * planeNum], static_cast<T *>(mapPtr) + y * rowPitch / sizeof(T),
-          width * planeNum * sizeof(T));
-
-        if (!DataCompare(surfaceFormat, format.image_channel_data_type, out, bufferIn[frameIdx % FRAME_NUM], width, height, planeNum))
-        {
-          log_error("Frame idx: %i, Mapped OCL object is different then expected\n", frameIdx);
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-
-        for (size_t y = 0; y < height; ++y)
-          memcpy(static_cast<T *>(mapPtr) + y * rowPitch / sizeof(T), &bufferExp[frameIdx % FRAME_NUM][y * width * planeNum],
-          width * planeNum * sizeof(T));
-
-        error = clEnqueueUnmapMemObject(cmdQueue, objectDstShared, mapPtr, 0, 0, 0);
-        if (error != CL_SUCCESS)
-        {
-          log_error("clEnqueueUnmapMemObject failed: %s\n", IGetErrorString(error));
-          result.ResultSub(CResult::TEST_FAIL);
-        }
-      }
-
-      error = clEnqueueReleaseDX9MediaSurfacesKHR(cmdQueue, static_cast<cl_uint>(memObjList.size()), &memObjList[0], 0, 0, 0);
-      if (error != CL_SUCCESS)
-      {
-        log_error("clEnqueueReleaseMediaSurfaceKHR failed: %s\n", IGetErrorString(error));
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-
-      std::vector<T> out(width * height * planeNum, 0);
-      //surface get
-#if defined(_WIN32)
-      if (FAILED((*dx9SurfaceDst)->LockRect(&rect, NULL, 0)))
-      {
-        log_error("Surface lock failed\n");
-        result.ResultSub(CResult::TEST_ERROR);
-        return result.Result();
-      }
-
-      pitch = rect.Pitch / sizeof(T);
-      lineSize = width * planeNum * sizeof(T);
-      ptr = static_cast<T *>(rect.pBits);
-      for (size_t y = 0; y < height; ++y)
-        memcpy(&out[y * width * planeNum], ptr + y * pitch, lineSize);
-
-      (*dx9SurfaceDst)->UnlockRect();
-#else
-      return TEST_NOT_IMPLEMENTED;
-#endif
-
-      if (!DataCompare(surfaceFormat, format.image_channel_data_type, out, bufferExp[frameIdx % FRAME_NUM], width, height, planeNum))
-      {
-        log_error("Frame idx: %i, media object is different then expected\n", frameIdx);
-        result.ResultSub(CResult::TEST_FAIL);
-      }
-    }
-  }
-
-  if (deviceWrapper->Status() != DEVICE_PASS)
-  {
-    std::string adapterName;
-    AdapterToString(adapterType, adapterName);
-    if (deviceWrapper->Status() == DEVICE_FAIL)
-  {
-      log_error("%s init failed\n", adapterName.c_str());
-    result.ResultSub(CResult::TEST_FAIL);
-    }
-    else
-    {
-      log_error("%s init incomplete due to unsupported device\n", adapterName.c_str());
-      result.ResultSub(CResult::TEST_NOTSUPPORTED);
-    }
-  }
-
-  return result.Result();
-}
-
-int test_other_data_types(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-{
-  CResult result;
-
-#if defined(_WIN32)
-  //D3D9
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 64, 256, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_R32F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, R32F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 256, 128, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_R16F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, R16F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 512, 256, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_L16, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, L16, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 256, 512, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_A8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, A8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 1024, 32, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_L8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, L8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 32, 1024, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_G32R32F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, G32R32F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 64, 64, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_G16R16F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, G16R16F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 256, 256, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_G16R16, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, G16R16, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 512, 128, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_A8L8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, A8L8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 128, 512, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_A32B32G32R32F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, A32B32G32R32F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 128, 128, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_A16B16G16R16F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, A16B16G16R16F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 64, 128, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_A16B16G16R16, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, A16B16G16R16, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 128, 64, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_A8B8G8R8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, A8B8G8R8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 16, 512, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_X8B8G8R8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, X8B8G8R8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 512, 16, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_A8R8G8B8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, A8R8G8B8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 256, 256, CL_ADAPTER_D3D9_KHR,
-    SURFACE_FORMAT_X8R8G8B8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9, X8R8G8B8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  //D3D9EX
-
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 64, 256, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_R32F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, R32F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 64, 256, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_R32F, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, R32F, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 256, 128, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_R16F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, R16F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 256, 128, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_R16F, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, R16F, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 512, 256, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_L16, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, L16, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 512, 256, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_L16, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, L16, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 256, 512, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_A8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, A8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 256, 512, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_A8, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, A8, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 1024, 32, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_L8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, L8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 1024, 32, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_L8, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, L8, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 32, 1024, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_G32R32F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, G32R32F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 32, 1024, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_G32R32F, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, G32R32F, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 64, 64, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_G16R16F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, G16R16F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 64, 64, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_G16R16F, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, G16R16F, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 256, 256, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_G16R16, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, G16R16, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 256, 256, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_G16R16, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, G16R16, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 512, 128, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_A8L8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, A8L8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 512, 128, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_A8L8, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, A8L8, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 128, 512, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_A32B32G32R32F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, A32B32G32R32F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 128, 512, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_A32B32G32R32F, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, A32B32G32R32F, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 128, 128, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_A16B16G16R16F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, A16B16G16R16F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 128, 128, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_A16B16G16R16F, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, A16B16G16R16F, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 64, 128, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_A16B16G16R16, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, A16B16G16R16, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 64, 128, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_A16B16G16R16, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, A16B16G16R16, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 128, 64, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_A8B8G8R8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, A8B8G8R8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 128, 64, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_A8B8G8R8, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, A8B8G8R8, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 16, 512, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_X8B8G8R8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, X8B8G8R8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 16, 512, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_X8B8G8R8, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, X8B8G8R8, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 512, 16, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_A8R8G8B8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, A8R8G8B8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 512, 16, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_A8R8G8B8, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, A8R8G8B8, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 256, 256, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_X8R8G8B8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, X8R8G8B8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 256, 256, CL_ADAPTER_D3D9EX_KHR,
-    SURFACE_FORMAT_X8R8G8B8, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (D3D9EX, X8R8G8B8, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  //DXVA
-
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 64, 256, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_R32F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, R32F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 64, 256, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_R32F, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, R32F, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 256, 128, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_R16F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, R16F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 256, 128, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_R16F, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, R16F, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 512, 256, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_L16, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, L16, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 512, 256, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_L16, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, L16, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 256, 512, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_A8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, A8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 256, 512, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_A8, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, A8, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 1024, 32, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_L8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, L8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 1024, 32, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_L8, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, L8, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 32, 1024, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_G32R32F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, G32R32F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 32, 1024, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_G32R32F, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, G32R32F, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 64, 64, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_G16R16F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, G16R16F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 64, 64, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_G16R16F, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, G16R16F, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 256, 256, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_G16R16, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, G16R16, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 256, 256, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_G16R16, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, G16R16, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 512, 128, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_A8L8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, A8L8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 512, 128, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_A8L8, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, A8L8, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 128, 512, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_A32B32G32R32F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, A32B32G32R32F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_float>(deviceID, context, queue, num_elements, 10, 128, 512, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_A32B32G32R32F, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, A32B32G32R32F, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 128, 128, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_A16B16G16R16F, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, A16B16G16R16F, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_half>(deviceID, context, queue, num_elements, 10, 128, 128, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_A16B16G16R16F, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, A16B16G16R16F, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 64, 128, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_A16B16G16R16, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, A16B16G16R16, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_ushort>(deviceID, context, queue, num_elements, 10, 64, 128, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_A16B16G16R16, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, A16B16G16R16, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 128, 64, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_A8B8G8R8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, A8B8G8R8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 128, 64, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_A8B8G8R8, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, A8B8G8R8, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 16, 512, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_X8B8G8R8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, X8B8G8R8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 16, 512, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_X8B8G8R8, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, X8B8G8R8, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 512, 16, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_A8R8G8B8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, A8R8G8B8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 512, 16, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_A8R8G8B8, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, A8R8G8B8, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 256, 256, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_X8R8G8B8, SHARED_HANDLE_DISABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, X8R8G8B8, no shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-  if(other_data_types<cl_uchar>(deviceID, context, queue, num_elements, 10, 256, 256, CL_ADAPTER_DXVA_KHR,
-    SURFACE_FORMAT_X8R8G8B8, SHARED_HANDLE_ENABLED) != 0)
-  {
-    log_error("\nTest case (DXVA, X8R8G8B8, shared handle) failed\n\n");
-    result.ResultSub(CResult::TEST_FAIL);
-  }
-
-#else
-  return TEST_NOT_IMPLEMENTED;
-#endif
-
-  return result.Result();
-}
diff --git a/test_extensions/media_sharing/utils.cpp b/test_extensions/media_sharing/utils.cpp
deleted file mode 100644
index 3129643203..0000000000
--- a/test_extensions/media_sharing/utils.cpp
+++ /dev/null
@@ -1,1595 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "utils.h"
-
-#include "harness/errorHelpers.h"
-#include "harness/imageHelpers.h"
-#include "harness/rounding_mode.h"
-
-#include <math.h>
-
-#include <CL/cl_half.h>
-
-static RoundingMode gFloatToHalfRoundingMode = kDefaultRoundingMode;
-
-
-CResult::CResult():
-_result(TEST_PASS), _resultLast(TEST_NORESULT)
-{
-
-}
-
-CResult::~CResult()
-{
-
-}
-
-CResult::TTestResult CResult::ResultLast() const
-{
-  return _resultLast;
-}
-
-int CResult::Result() const
-{
-  switch (_result)
-  {
-  case TEST_NORESULT:
-  case TEST_NOTSUPPORTED:
-  case TEST_PASS:
-    return 0;
-    break;
-  case TEST_FAIL:
-    return 1;
-    break;
-  case TEST_ERROR:
-    return 2;
-    break;
-  default:
-    return -1;
-    break;
-  }
-}
-
-void CResult::ResultSub( TTestResult result )
-{
-  _resultLast = result;
-  if (static_cast<int>(result) > static_cast<int>(_result))
-    _result = result;
-}
-
-void FunctionContextCreateToString(TContextFuncType contextCreateFunction, std::string &contextFunction)
-{
-  switch(contextCreateFunction)
-  {
-  case CONTEXT_CREATE_DEFAULT:
-    contextFunction = "CreateContext";
-    break;
-  case CONTEXT_CREATE_FROM_TYPE:
-    contextFunction = "CreateContextFromType";
-    break;
-  default:
-    contextFunction = "Unknown";
-    log_error("FunctionContextCreateToString(): Unknown create function enum!");
-    break;
-  }
-}
-
-void AdapterToString(cl_dx9_media_adapter_type_khr adapterType, std::string &adapter)
-{
-  switch(adapterType)
-  {
-  case CL_ADAPTER_D3D9_KHR:
-    adapter = "D3D9";
-    break;
-  case CL_ADAPTER_D3D9EX_KHR:
-    adapter = "D3D9EX";
-    break;
-  case CL_ADAPTER_DXVA_KHR:
-    adapter = "DXVA";
-    break;
-  default:
-    adapter = "Unknown";
-    log_error("AdapterToString(): Unknown adapter type!");
-    break;
-  }
-}
-
-cl_context_info AdapterTypeToContextInfo( cl_dx9_media_adapter_type_khr adapterType )
-{
-  switch (adapterType)
-  {
-  case CL_ADAPTER_D3D9_KHR:
-    return CL_CONTEXT_ADAPTER_D3D9_KHR;
-    break;
-  case CL_ADAPTER_D3D9EX_KHR:
-    return CL_CONTEXT_ADAPTER_D3D9EX_KHR;
-    break;
-  case CL_ADAPTER_DXVA_KHR:
-    return CL_CONTEXT_ADAPTER_DXVA_KHR;
-    break;
-  default:
-    log_error("AdapterTypeToContextInfo(): Unknown adapter type!");
-    return 0;
-    break;
-  }
-}
-
-void YUVGenerateNV12( std::vector<cl_uchar> &yuv, unsigned int width, unsigned int height,
-                     cl_uchar valueMin, cl_uchar valueMax, double valueAdd )
-{
-  yuv.clear();
-  yuv.resize(width * height * 3 / 2, 0);
-
-  double min = static_cast<double>(valueMin);
-  double max = static_cast<double>(valueMax);
-  double range = 255;
-  double add = static_cast<double>(valueAdd * range);
-  double stepX = (max - min) / static_cast<double>(width);
-  double stepY = (max - min) /static_cast<double>(height);
-
-  //generate Y plane
-  for (unsigned int i = 0; i < height; ++i)
-  {
-    unsigned int offset = i * width;
-    double valueYPlane0 = static_cast<double>(stepY * i);
-    for (unsigned int j = 0; j < width; ++j)
-    {
-      double valueXPlane0 = static_cast<double>(stepX * j);
-      yuv.at(offset + j) = static_cast<cl_uchar>(min + valueXPlane0 / 2 + valueYPlane0 / 2 + add);
-    }
-  }
-
-  //generate UV planes
-  for (unsigned int i = 0; i < height / 2; ++i)
-  {
-    unsigned int offset = width * height + i * width;
-    double valueYPlane1 = static_cast<double>(stepY * i);
-    double valueYPlane2 = static_cast<double>(stepY * (height / 2 + i));
-    for (unsigned int j = 0; j < width / 2; ++j)
-    {
-      double valueXPlane1 = static_cast<double>(stepX * j);
-      double valueXPlane2 = static_cast<double>(stepX * (width / 2 + j));
-
-      yuv.at(offset + j * 2) = static_cast<cl_uchar>(min + valueXPlane1 / 2 + valueYPlane1 / 2 + add);
-      yuv.at(offset + j * 2 + 1) = static_cast<cl_uchar>(min + valueXPlane2 / 2 + valueYPlane2 / 2 + add);
-    }
-  }
-}
-
-void YUVGenerateYV12( std::vector<cl_uchar> &yuv, unsigned int width, unsigned int height, cl_uchar valueMin, cl_uchar valueMax, double valueAdd /*= 0.0*/ )
-{
-  yuv.clear();
-  yuv.resize(width * height * 3 / 2, 0);
-
-  double min = static_cast<double>(valueMin);
-  double max = static_cast<double>(valueMax);
-  double range = 255;
-  double add = static_cast<double>(valueAdd * range);
-  double stepX = (max - min) / static_cast<double>(width);
-  double stepY = (max - min) /static_cast<double>(height);
-
-  unsigned offset = 0;
-
-  //generate Y plane
-  for (unsigned int i = 0; i < height; ++i)
-  {
-    unsigned int plane0Offset = offset + i * width;
-    double valueYPlane0 = static_cast<double>(stepY * i);
-    for (unsigned int j = 0; j < width; ++j)
-    {
-      double valueXPlane0 = static_cast<double>(stepX * j);
-      yuv.at(plane0Offset + j) = static_cast<cl_uchar>(min + valueXPlane0 / 2 + valueYPlane0 / 2 + add);
-    }
-  }
-
-  //generate V plane
-  offset += width * height;
-  for (unsigned int i = 0; i < height / 2; ++i)
-  {
-    unsigned int plane1Offset = offset + i * width / 2;
-    double valueYPlane1 = static_cast<double>(stepY * i);
-    for (unsigned int j = 0; j < width / 2; ++j)
-    {
-      double valueXPlane1 = static_cast<double>(stepX * j);
-      yuv.at(plane1Offset + j) = static_cast<cl_uchar>(min + valueXPlane1 / 2 + valueYPlane1 / 2 + add);
-    }
-  }
-
-  //generate U plane
-  offset += width * height / 4;
-  for (unsigned int i = 0; i < height / 2; ++i)
-  {
-    unsigned int plane2Offset = offset + i * width / 2;
-    double valueYPlane2 = static_cast<double>(stepY * (height / 2 + i));
-    for (unsigned int j = 0; j < width / 2; ++j)
-    {
-      double valueXPlane2 = static_cast<double>(stepX * j);
-      yuv.at(plane2Offset + j) = static_cast<cl_uchar>(min + valueXPlane2 / 2 + valueYPlane2 / 2 + add);
-    }
-  }
-}
-
-
-bool YUVGenerate( TSurfaceFormat surfaceFormat, std::vector<cl_uchar> &yuv, unsigned int width, unsigned int height, cl_uchar valueMin, cl_uchar valueMax, double valueAdd /*= 0.0*/ )
-{
-  switch (surfaceFormat)
-  {
-  case SURFACE_FORMAT_NV12:
-    YUVGenerateNV12(yuv, width, height, valueMin, valueMax, valueAdd);
-    break;
-  case SURFACE_FORMAT_YV12:
-    YUVGenerateYV12(yuv, width, height, valueMin, valueMax, valueAdd);
-    break;
-  default:
-    log_error("YUVGenerate(): Invalid surface type\n");
-    return false;
-    break;
-  }
-
-  return true;
-}
-
-bool YUVSurfaceSetNV12( std::auto_ptr<CSurfaceWrapper> &surface, const std::vector<cl_uchar> &yuv,
-                       unsigned int width, unsigned int height )
-{
-#if defined(_WIN32)
-  CD3D9SurfaceWrapper *d3dSurface = static_cast<CD3D9SurfaceWrapper *>(surface.get());
-  D3DLOCKED_RECT rect;
-  if (FAILED((*d3dSurface)->LockRect(&rect, NULL, 0)))
-  {
-    log_error("YUVSurfaceSetNV12(): Surface lock failed\n");
-    return false;
-  }
-
-  size_t pitch = rect.Pitch / sizeof(cl_uchar);
-  size_t lineSize = width * sizeof(cl_uchar);
-  cl_uchar *ptr = static_cast<cl_uchar *>(rect.pBits);
-  for (size_t y = 0; y < height; ++y)
-    memcpy(ptr + y * pitch, &yuv.at(y * width), lineSize);
-
-  for (size_t y = 0; y < height / 2; ++y)
-    memcpy(ptr + height * pitch + y * pitch, &yuv.at(width * height + y * width), lineSize);
-
-  (*d3dSurface)->UnlockRect();
-
-  return true;
-
-#else
-  return false;
-#endif
-}
-
-bool YUVSurfaceSetYV12( std::auto_ptr<CSurfaceWrapper> &surface, const std::vector<cl_uchar> &yuv,
-                       unsigned int width, unsigned int height )
-{
-#if defined(_WIN32)
-  CD3D9SurfaceWrapper *d3dSurface = static_cast<CD3D9SurfaceWrapper *>(surface.get());
-  D3DLOCKED_RECT rect;
-  if (FAILED((*d3dSurface)->LockRect(&rect, NULL, 0)))
-  {
-    log_error("YUVSurfaceSetYV12(): Surface lock failed!\n");
-    return false;
-  }
-
-  size_t pitch = rect.Pitch / sizeof(cl_uchar);
-  size_t pitchHalf = pitch / 2;
-  size_t lineSize = width * sizeof(cl_uchar);
-  size_t lineHalfSize = lineSize / 2;
-  size_t surfaceOffset = 0;
-  size_t yuvOffset = 0;
-  cl_uchar *ptr = static_cast<cl_uchar *>(rect.pBits);
-
-  for (size_t y = 0; y < height; ++y)
-    memcpy(ptr + surfaceOffset + y * pitch, &yuv.at(yuvOffset + y * width), lineSize);
-
-  surfaceOffset += height * pitch;
-  yuvOffset += width * height;
-  for (size_t y = 0; y < height / 2; ++y)
-    memcpy(ptr + surfaceOffset + y * pitchHalf, &yuv.at(yuvOffset + y * lineHalfSize), lineHalfSize);
-
-  surfaceOffset += pitchHalf * height / 2;
-  yuvOffset += width * height / 4;
-  for (size_t y = 0; y < height / 2; ++y)
-    memcpy(ptr + surfaceOffset + y * pitchHalf, &yuv.at(yuvOffset + y * lineHalfSize), lineHalfSize);
-
-  (*d3dSurface)->UnlockRect();
-
-  return true;
-
-#else
-  return false;
-#endif
-}
-
-bool YUVSurfaceSet(TSurfaceFormat surfaceFormat, std::auto_ptr<CSurfaceWrapper> &surface, const std::vector<cl_uchar> &yuv, unsigned int width, unsigned int height )
-{
-  switch (surfaceFormat)
-  {
-  case SURFACE_FORMAT_NV12:
-    if(!YUVSurfaceSetNV12(surface, yuv, width, height))
-      return false;
-    break;
-  case SURFACE_FORMAT_YV12:
-    if(!YUVSurfaceSetYV12(surface, yuv, width, height))
-      return false;
-    break;
-  default:
-    log_error("YUVSurfaceSet(): Invalid surface type!\n");
-    return false;
-    break;
-  }
-
-  return true;
-}
-
-bool YUVSurfaceGetNV12( std::auto_ptr<CSurfaceWrapper> &surface, std::vector<cl_uchar> &yuv,
-                       unsigned int width, unsigned int height )
-{
-#if defined(_WIN32)
-  CD3D9SurfaceWrapper *d3dSurface = static_cast<CD3D9SurfaceWrapper *>(surface.get());
-  D3DLOCKED_RECT rect;
-  if (FAILED((*d3dSurface)->LockRect(&rect, NULL, 0)))
-  {
-    log_error("YUVSurfaceGetNV12(): Surface lock failed!\n");
-    return false;
-  }
-
-  size_t pitch = rect.Pitch / sizeof(cl_uchar);
-  size_t lineSize = width * sizeof(cl_uchar);
-  cl_uchar *ptr = static_cast<cl_uchar *>(rect.pBits);
-  size_t yuvOffset = 0;
-  size_t surfaceOffset = 0;
-  for (size_t y = 0; y < height; ++y)
-    memcpy(&yuv.at(yuvOffset + y * width), ptr + y * pitch, lineSize);
-
-  yuvOffset += width * height;
-  surfaceOffset += pitch * height;
-  for (size_t y = 0; y < height / 2; ++y)
-    memcpy(&yuv.at(yuvOffset + y * width), ptr + surfaceOffset + y * pitch, lineSize);
-
-  (*d3dSurface)->UnlockRect();
-
-  return true;
-
-#else
-  return false;
-#endif
-}
-
-bool YUVSurfaceGetYV12( std::auto_ptr<CSurfaceWrapper> &surface, std::vector<cl_uchar> &yuv, unsigned int width, unsigned int height )
-{
-#if defined(_WIN32)
-  CD3D9SurfaceWrapper *d3dSurface = static_cast<CD3D9SurfaceWrapper *>(surface.get());
-  D3DLOCKED_RECT rect;
-  if (FAILED((*d3dSurface)->LockRect(&rect, NULL, 0)))
-  {
-    log_error("YUVSurfaceGetYV12(): Surface lock failed!\n");
-    return false;
-  }
-
-  size_t pitch = rect.Pitch / sizeof(cl_uchar);
-  size_t pitchHalf = pitch / 2;
-  size_t lineSize = width * sizeof(cl_uchar);
-  size_t lineHalfSize = lineSize / 2;
-  size_t surfaceOffset = 0;
-  size_t yuvOffset = 0;
-  cl_uchar *ptr = static_cast<cl_uchar *>(rect.pBits);
-
-  for (size_t y = 0; y < height; ++y)
-    memcpy(&yuv.at(yuvOffset + y * width), ptr + surfaceOffset + y * pitch, lineSize);
-
-  surfaceOffset += pitch * height;
-  yuvOffset += width * height;
-  for (size_t y = 0; y < height / 2; ++y)
-    memcpy(&yuv.at(yuvOffset + y * lineHalfSize), ptr + surfaceOffset + y * pitchHalf, lineHalfSize);
-
-  surfaceOffset += pitchHalf * height / 2;
-  yuvOffset += width * height / 4;
-  for (size_t y = 0; y < height / 2; ++y)
-    memcpy(&yuv.at(yuvOffset + y * lineHalfSize), ptr + surfaceOffset + y * pitchHalf, lineHalfSize);
-
-  (*d3dSurface)->UnlockRect();
-
-  return true;
-
-#else
-  return false;
-#endif
-}
-
-bool YUVSurfaceGet(TSurfaceFormat surfaceFormat, std::auto_ptr<CSurfaceWrapper> &surface, std::vector<cl_uchar> &yuv,
-                   unsigned int width, unsigned int height )
-{
-  switch (surfaceFormat)
-  {
-  case SURFACE_FORMAT_NV12:
-    if(!YUVSurfaceGetNV12(surface, yuv, width, height))
-      return false;
-    break;
-  case SURFACE_FORMAT_YV12:
-    if(!YUVSurfaceGetYV12(surface, yuv, width, height))
-      return false;
-    break;
-  default:
-    log_error("YUVSurfaceGet(): Invalid surface type!\n");
-    return false;
-    break;
-  }
-
-  return true;
-}
-
-bool YUVCompareNV12( const std::vector<cl_uchar> &yuvTest, const std::vector<cl_uchar> &yuvRef,
-                    unsigned int width, unsigned int height )
-{
-  //plane 0 verification
-  size_t offset = 0;
-  for (size_t y = 0; y < height; ++y)
-  {
-    size_t plane0Offset = offset + width * y;
-    for (size_t x = 0; x < width; ++x)
-    {
-      if (yuvTest[plane0Offset + x] != yuvRef[plane0Offset + x])
-      {
-        log_error("Plane 0 (Y) is different than expected, reference value: %i, test value: %i, x: %i, y: %i\n",
-          yuvRef[plane0Offset + x], yuvTest[plane0Offset + x], x, y);
-        return false;
-      }
-    }
-  }
-
-  //plane 1 and 2 verification
-  offset += width * height;
-  for (size_t y = 0; y < height / 2; ++y)
-  {
-    size_t plane12Offset = offset + width * y;
-    for (size_t x = 0; x < width / 2; ++x)
-    {
-      if (yuvTest.at(plane12Offset + 2 * x) != yuvRef.at(plane12Offset + 2 * x))
-      {
-        log_error("Plane 1 (U) is different than expected, reference value: %i, test value: %i, x: %i, y: %i\n",
-          yuvRef[plane12Offset + 2 * x], yuvTest[plane12Offset + 2 * x], x, y);
-        return false;
-      }
-
-      if (yuvTest.at(plane12Offset + 2 * x + 1) != yuvRef.at(plane12Offset + 2 * x + 1))
-      {
-        log_error("Plane 2 (V) is different than expected, reference value: %i, test value: %i, x: %i, y: %i\n",
-          yuvRef[plane12Offset + 2 * x + 1], yuvTest[plane12Offset + 2 * x + 1], x, y);
-        return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-bool YUVCompareYV12( const std::vector<cl_uchar> &yuvTest, const std::vector<cl_uchar> &yuvRef,
-                    unsigned int width, unsigned int height )
-{
-  //plane 0 verification
-  size_t offset = 0;
-  for (size_t y = 0; y < height; ++y)
-  {
-    size_t plane0Offset = width * y;
-    for (size_t x = 0; x < width; ++x)
-    {
-      if (yuvTest.at(plane0Offset + x) != yuvRef.at(plane0Offset + x))
-      {
-        log_error("Plane 0 (Y) is different than expected, reference value: %i, test value: %i, x: %i, y: %i\n",
-          yuvRef[plane0Offset + x], yuvTest[plane0Offset + x], x ,y);
-        return false;
-      }
-    }
-  }
-
-  //plane 1 verification
-  offset += width * height;
-  for (size_t y = 0; y < height / 2; ++y)
-  {
-    size_t plane1Offset = offset + width * y / 2;
-    for (size_t x = 0; x < width / 2; ++x)
-    {
-      if (yuvTest.at(plane1Offset + x) != yuvRef.at(plane1Offset + x))
-      {
-        log_error("Plane 1 (V) is different than expected, reference value: %i, test value: %i, x: %i, y: %i\n",
-          yuvRef[plane1Offset + x], yuvTest[plane1Offset + x], x, y);
-        return false;
-      }
-    }
-  }
-
-  //plane 2 verification
-  offset += width * height / 4;
-  for (size_t y = 0; y < height / 2; ++y)
-  {
-    size_t plane2Offset = offset + width * y / 2;
-    for (size_t x = 0; x < width / 2; ++x)
-    {
-      if (yuvTest.at(plane2Offset + x) != yuvRef.at(plane2Offset + x))
-      {
-        log_error("Plane 2 (U) is different than expected, reference value: %i, test value: %i, x: %i, y: %i\n",
-          yuvRef[plane2Offset + x], yuvTest[plane2Offset + x], x, y);
-        return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-bool YUVCompare( TSurfaceFormat surfaceFormat, const std::vector<cl_uchar> &yuvTest, const std::vector<cl_uchar> &yuvRef,
-                unsigned int width, unsigned int height )
-{
-  switch (surfaceFormat)
-  {
-  case SURFACE_FORMAT_NV12:
-    if (!YUVCompareNV12(yuvTest, yuvRef, width, height))
-    {
-      log_error("OCL object is different than expected!\n");
-      return false;
-    }
-    break;
-  case SURFACE_FORMAT_YV12:
-    if (!YUVCompareYV12(yuvTest, yuvRef, width, height))
-    {
-      log_error("OCL object is different than expected!\n");
-      return false;
-    }
-    break;
-  default:
-    log_error("YUVCompare(): Invalid surface type!\n");
-    return false;
-    break;
-  }
-
-  return true;
-}
-
-void DataGenerate( TSurfaceFormat surfaceFormat, cl_channel_type type, std::vector<float> &data, unsigned int width, unsigned int height,
-                  unsigned int channelNum, float cmin /*= 0.0f*/, float cmax /*= 1.0f*/, float add /*= 0.0f*/ )
-{
-  data.clear();
-  data.reserve(width * height * channelNum);
-
-  double valueMin = static_cast<double>(cmin);
-  double valueMax = static_cast<double>(cmax);
-  double stepX = (valueMax - valueMin) / static_cast<double>(width);
-  double stepY = (valueMax - valueMin) /static_cast<double>(height);
-  double valueAdd = static_cast<double>(add);
-  for (unsigned int i = 0; i < height; ++i)
-  {
-    double valueY = static_cast<double>(stepY * i);
-    for (unsigned int j = 0; j < width; ++j)
-    {
-      double valueX = static_cast<double>(stepX * j);
-      switch (channelNum)
-      {
-      case 1:
-        data.push_back(static_cast<float>(valueMin + valueX / 2 + valueY / 2 + valueAdd));
-        break;
-      case 2:
-        data.push_back(static_cast<float>(valueMin + valueX + valueAdd));
-        data.push_back(static_cast<float>(valueMin + valueY + valueAdd));
-        break;
-      case 4:
-        data.push_back(static_cast<float>(valueMin + valueX + valueAdd));
-        data.push_back(static_cast<float>(valueMin + valueY + valueAdd));
-        data.push_back(static_cast<float>(valueMin + valueX / 2 + valueAdd));
-        data.push_back(static_cast<float>(valueMin + valueY / 2 + valueAdd));
-        break;
-      default:
-        log_error("DataGenerate(): invalid channel number!");
-        return;
-        break;
-      }
-    }
-  }
-}
-
-void DataGenerate( TSurfaceFormat surfaceFormat, cl_channel_type type, std::vector<cl_half> &data, unsigned int width, unsigned int height,
-                  unsigned int channelNum, float cmin /*= 0.0f*/, float cmax /*= 1.0f*/, float add /*= 0.0f*/ )
-{
-  data.clear();
-  data.reserve(width * height * channelNum);
-
-  double valueMin = static_cast<double>(cmin);
-  double valueMax = static_cast<double>(cmax);
-  double stepX = (valueMax - valueMin) / static_cast<double>(width);
-  double stepY = (valueMax - valueMin) /static_cast<double>(height);
-
-  switch(type)
-  {
-  case CL_HALF_FLOAT:
-    {
-      double valueAdd = static_cast<double>(add);
-
-      for (unsigned int i = 0; i < height; ++i)
-      {
-        double valueY = static_cast<double>(stepY * i);
-        for (unsigned int j = 0; j < width; ++j)
-        {
-          double valueX = static_cast<double>(stepX * j);
-          switch (channelNum)
-          {
-          case 1:
-            data.push_back(convert_float_to_half(static_cast<float>(valueMin + valueX / 2 + valueY / 2 + valueAdd)));
-            break;
-          case 2:
-            data.push_back(convert_float_to_half(static_cast<float>(valueMin + valueX + valueAdd)));
-            data.push_back(convert_float_to_half(static_cast<float>(valueMin + valueY + valueAdd)));
-            break;
-          case 4:
-            data.push_back(convert_float_to_half(static_cast<float>(valueMin + valueX + valueAdd)));
-            data.push_back(convert_float_to_half(static_cast<float>(valueMin + valueY + valueAdd)));
-            data.push_back(convert_float_to_half(static_cast<float>(valueMin + valueX / 2 + valueAdd)));
-            data.push_back(convert_float_to_half(static_cast<float>(valueMin + valueY / 2 + valueAdd)));
-            break;
-          default:
-            log_error("DataGenerate(): invalid channel number!");
-            return;
-            break;
-          }
-        }
-      }
-      break;
-    }
-  case CL_UNORM_INT16:
-    {
-      double range = 65535;
-      double valueAdd = static_cast<double>(add * range);
-
-      for (unsigned int i = 0; i < height; ++i)
-      {
-        double valueY = static_cast<double>(stepY * i * range);
-        for (unsigned int j = 0; j < width; ++j)
-        {
-          double valueX = static_cast<double>(stepX * j * range);
-          switch (channelNum)
-          {
-          case 1:
-            data.push_back(static_cast<cl_ushort>(valueMin + valueX / 2 + valueY / 2 + valueAdd));
-            break;
-          case 2:
-            data.push_back(static_cast<cl_ushort>(valueMin + valueX + valueAdd));
-            data.push_back(static_cast<cl_ushort>(valueMin + valueY + valueAdd));
-            break;
-          case 4:
-            data.push_back(static_cast<cl_ushort>(valueMin + valueX + valueAdd));
-            data.push_back(static_cast<cl_ushort>(valueMin + valueY + valueAdd));
-            data.push_back(static_cast<cl_ushort>(valueMin + valueX / 2 + valueAdd));
-            data.push_back(static_cast<cl_ushort>(valueMin + valueY / 2 + valueAdd));
-            break;
-          default:
-            log_error("DataGenerate(): invalid channel number!");
-            return;
-            break;
-          }
-        }
-      }
-    }
-    break;
-  default:
-    log_error("DataGenerate(): unknown data type!");
-    return;
-    break;
-  }
-}
-
-void DataGenerate( TSurfaceFormat surfaceFormat, cl_channel_type type, std::vector<cl_uchar> &data, unsigned int width, unsigned int height,
-                  unsigned int channelNum, float cmin /*= 0.0f*/, float cmax /*= 1.0f*/, float add /*= 0.0f*/ )
-{
-  data.clear();
-  data.reserve(width * height * channelNum);
-
-  double valueMin = static_cast<double>(cmin);
-  double valueMax = static_cast<double>(cmax);
-  double stepX = (valueMax - valueMin) / static_cast<double>(width);
-  double stepY = (valueMax - valueMin) /static_cast<double>(height);
-
-  double range = 255;
-  double valueAdd = static_cast<double>(add * range);
-
-  for (unsigned int i = 0; i < height; ++i)
-  {
-    double valueY = static_cast<double>(stepY * i * range);
-    for (unsigned int j = 0; j < width; ++j)
-    {
-      double valueX = static_cast<double>(stepX * j * range);
-      switch (channelNum)
-      {
-      case 1:
-        data.push_back(static_cast<cl_uchar>(valueMin + valueX / 2 + valueY / 2 + valueAdd));
-        break;
-      case 2:
-        data.push_back(static_cast<cl_uchar>(valueMin + valueX + valueAdd));
-        data.push_back(static_cast<cl_uchar>(valueMin + valueY + valueAdd));
-        break;
-      case 4:
-        data.push_back(static_cast<cl_uchar>(valueMin + valueX + valueAdd));
-        data.push_back(static_cast<cl_uchar>(valueMin + valueY + valueAdd));
-        data.push_back(static_cast<cl_uchar>(valueMin + valueX / 2 + valueAdd));
-        if (surfaceFormat == SURFACE_FORMAT_X8R8G8B8)
-          data.push_back(static_cast<cl_uchar>(0xff));
-        else
-          data.push_back(static_cast<cl_uchar>(valueMin + valueY / 2 + valueAdd));
-        break;
-      default:
-        log_error("DataGenerate(): invalid channel number!");
-        return;
-        break;
-      }
-    }
-  }
-}
-
-bool DataCompare( TSurfaceFormat surfaceFormat, cl_channel_type type, const std::vector<float> &dataTest, const std::vector<float> &dataExp,
-                 unsigned int width, unsigned int height, unsigned int channelNum)
-{
-  float epsilon = 0.000001f;
-  for (unsigned int i = 0; i < height; ++i)
-  {
-    unsigned int offset = i * width * channelNum;
-    for (unsigned int j = 0; j < width; ++j)
-    {
-      for(unsigned planeIdx = 0; planeIdx < channelNum; ++planeIdx)
-      {
-        if (abs(dataTest.at(offset + j * channelNum + planeIdx) - dataExp.at(offset + j * channelNum + planeIdx)) > epsilon)
-        {
-          log_error("Tested image is different than reference (x,y,plane) = (%i,%i,%i), test value = %f, expected value = %f\n",
-            j, i, planeIdx, dataTest[offset + j * channelNum + planeIdx], dataExp[offset + j * channelNum + planeIdx]);
-          return false;
-        }
-      }
-    }
-  }
-
-  return true;
-}
-
-bool DataCompare( TSurfaceFormat surfaceFormat, cl_channel_type type, const std::vector<cl_half> &dataTest, const std::vector<cl_half> &dataExp,
-                 unsigned int width, unsigned int height, unsigned int channelNum)
-{
-  switch(type)
-  {
-  case CL_HALF_FLOAT:
-    {
-      float epsilon = 0.001f;
-      for (unsigned int i = 0; i < height; ++i)
-      {
-        unsigned int offset = i * width * channelNum;
-        for (unsigned int j = 0; j < width; ++j)
-        {
-          for(unsigned planeIdx = 0; planeIdx < channelNum; ++planeIdx)
-          {
-              float test = cl_half_to_float(
-                  dataTest.at(offset + j * channelNum + planeIdx));
-              float ref = cl_half_to_float(
-                  dataExp.at(offset + j * channelNum + planeIdx));
-              if (abs(test - ref) > epsilon)
-              {
-                  log_error(
-                      "Tested image is different than reference (x,y,plane) = "
-                      "(%i,%i,%i), test value = %f, expected value = %f\n",
-                      j, i, planeIdx, test, ref);
-                  return false;
-              }
-          }
-        }
-      }
-    }
-    break;
-  case CL_UNORM_INT16:
-    {
-      cl_ushort epsilon = 1;
-      for (unsigned int i = 0; i < height; ++i)
-      {
-        unsigned int offset = i * width * channelNum;
-        for (unsigned int j = 0; j < width; ++j)
-        {
-          for(unsigned planeIdx = 0; planeIdx < channelNum; ++planeIdx)
-          {
-            cl_ushort test = dataTest.at(offset + j * channelNum + planeIdx);
-            cl_ushort ref = dataExp.at(offset + j * channelNum + planeIdx);
-            if (abs(test - ref) > epsilon)
-            {
-              log_error("Tested image is different than reference (x,y,plane) = (%i,%i,%i), test value = %i, expected value = %i\n", j, i, planeIdx, test, ref);
-              return false;
-            }
-          }
-        }
-      }
-    }
-    break;
-  default:
-    log_error("DataCompare(): Invalid data format!");
-    return false;
-    break;
-  }
-
-  return true;
-}
-
-bool DataCompare( TSurfaceFormat surfaceFormat, cl_channel_type type, const std::vector<cl_uchar> &dataTest, const std::vector<cl_uchar> &dataExp,
-                 unsigned int width, unsigned int height, unsigned int planeNum )
-{
-  for (unsigned int i = 0; i < height; ++i)
-  {
-    unsigned int offset = i * width * planeNum;
-    for (unsigned int j = 0; j < width; ++j)
-    {
-      for(unsigned planeIdx = 0; planeIdx < planeNum; ++planeIdx)
-      {
-        if (surfaceFormat == SURFACE_FORMAT_X8R8G8B8 && planeIdx == 3)
-          continue;
-
-        cl_uchar test = dataTest.at(offset + j * planeNum + planeIdx);
-        cl_uchar ref = dataExp.at(offset + j * planeNum + planeIdx);
-        if (test != ref)
-        {
-          log_error("Tested image is different than reference (x,y,plane) = (%i,%i,%i), test value = %i, expected value = %i\n",
-            j, i, planeIdx, test, ref);
-          return false;
-        }
-      }
-    }
-  }
-
-  return true;
-}
-
-bool GetImageInfo( cl_mem object, cl_image_format formatExp, size_t elementSizeExp, size_t rowPitchExp,
-                  size_t slicePitchExp, size_t widthExp, size_t heightExp, size_t depthExp , unsigned int planeExp)
-{
-  bool result = true;
-
-  cl_image_format format;
-  if (clGetImageInfo(object, CL_IMAGE_FORMAT, sizeof(cl_image_format), &format, 0) != CL_SUCCESS)
-  {
-    log_error("clGetImageInfo(CL_IMAGE_FORMAT) failed\n");
-    result = false;
-  }
-
-  if (formatExp.image_channel_order != format.image_channel_order || formatExp.image_channel_data_type != format.image_channel_data_type)
-  {
-    log_error("Value of CL_IMAGE_FORMAT is different than expected\n");
-    result = false;
-  }
-
-  size_t elementSize = 0;
-  if (clGetImageInfo(object, CL_IMAGE_ELEMENT_SIZE, sizeof(size_t), &elementSize, 0) != CL_SUCCESS)
-  {
-    log_error("clGetImageInfo(CL_IMAGE_ELEMENT_SIZE) failed\n");
-    result = false;
-  }
-
-  if (elementSizeExp != elementSize)
-  {
-    log_error("Value of CL_IMAGE_ELEMENT_SIZE is different than expected (size: %i, exp size: %i)\n", elementSize, elementSizeExp);
-    result = false;
-  }
-
-  size_t rowPitch = 0;
-  if (clGetImageInfo(object, CL_IMAGE_ROW_PITCH, sizeof(size_t), &rowPitch, 0) != CL_SUCCESS)
-  {
-    log_error("clGetImageInfo(CL_IMAGE_ROW_PITCH) failed\n");
-    result = false;
-  }
-
-  if ((rowPitchExp == 0 && rowPitchExp != rowPitch) || (rowPitchExp > 0 && rowPitchExp > rowPitch))
-  {
-    log_error("Value of CL_IMAGE_ROW_PITCH is different than expected (size: %i, exp size: %i)\n", rowPitch, rowPitchExp);
-    result = false;
-  }
-
-  size_t slicePitch = 0;
-  if (clGetImageInfo(object, CL_IMAGE_SLICE_PITCH, sizeof(size_t), &slicePitch, 0) != CL_SUCCESS)
-  {
-    log_error("clGetImageInfo(CL_IMAGE_SLICE_PITCH) failed\n");
-    result = false;
-  }
-
-  if ((slicePitchExp == 0 && slicePitchExp != slicePitch) || (slicePitchExp > 0 && slicePitchExp > slicePitch))
-  {
-    log_error("Value of CL_IMAGE_SLICE_PITCH is different than expected (size: %i, exp size: %i)\n", slicePitch, slicePitchExp);
-    result = false;
-  }
-
-  size_t width = 0;
-  if (clGetImageInfo(object, CL_IMAGE_WIDTH, sizeof(size_t), &width, 0) != CL_SUCCESS)
-  {
-    log_error("clGetImageInfo(CL_IMAGE_WIDTH) failed\n");
-    result = false;
-  }
-
-  if (widthExp != width)
-  {
-    log_error("Value of CL_IMAGE_WIDTH is different than expected (size: %i, exp size: %i)\n", width, widthExp);
-    result = false;
-  }
-
-  size_t height = 0;
-  if (clGetImageInfo(object, CL_IMAGE_HEIGHT, sizeof(size_t), &height, 0) != CL_SUCCESS)
-  {
-    log_error("clGetImageInfo(CL_IMAGE_HEIGHT) failed\n");
-    result = false;
-  }
-
-  if (heightExp != height)
-  {
-    log_error("Value of CL_IMAGE_HEIGHT is different than expected (size: %i, exp size: %i)\n", height, heightExp);
-    result = false;
-  }
-
-  size_t depth = 0;
-  if (clGetImageInfo(object, CL_IMAGE_DEPTH, sizeof(size_t), &depth, 0) != CL_SUCCESS)
-  {
-    log_error("clGetImageInfo(CL_IMAGE_DEPTH) failed\n");
-    result = false;
-  }
-
-  if (depthExp != depth)
-  {
-    log_error("Value of CL_IMAGE_DEPTH is different than expected (size: %i, exp size: %i)\n", depth, depthExp);
-    result = false;
-  }
-
-  unsigned int plane = 99;
-  size_t paramSize = 0;
-  if (clGetImageInfo(object, CL_IMAGE_DX9_MEDIA_PLANE_KHR, sizeof(unsigned int), &plane, &paramSize) != CL_SUCCESS)
-  {
-    log_error("clGetImageInfo(CL_IMAGE_MEDIA_SURFACE_PLANE_KHR) failed\n");
-    result = false;
-  }
-
-  if (planeExp != plane)
-  {
-    log_error("Value of CL_IMAGE_MEDIA_SURFACE_PLANE_KHR is different than expected (plane: %i, exp plane: %i)\n", plane, planeExp);
-    result = false;
-  }
-
-  return result;
-}
-
-bool GetMemObjInfo( cl_mem object, cl_dx9_media_adapter_type_khr adapterType,  std::auto_ptr<CSurfaceWrapper> &surface, void *shareHandleExp )
-{
-  bool result = true;
-  switch(adapterType)
-  {
-  case CL_ADAPTER_D3D9_KHR:
-  case CL_ADAPTER_D3D9EX_KHR:
-  case CL_ADAPTER_DXVA_KHR:
-    {
-#if defined(_WIN32)
-      cl_dx9_surface_info_khr surfaceInfo;
-#else
-      void *surfaceInfo = 0;
-      return false;
-#endif
-      size_t paramSize = 0;
-      if(clGetMemObjectInfo(object, CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR, sizeof(surfaceInfo), &surfaceInfo, &paramSize) != CL_SUCCESS)
-      {
-        log_error("clGetImageInfo(CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR) failed\n");
-        result = false;
-      }
-
-#if defined(_WIN32)
-      CD3D9SurfaceWrapper *d3d9Surface = static_cast<CD3D9SurfaceWrapper *>(surface.get());
-      if (*d3d9Surface != surfaceInfo.resource)
-      {
-        log_error("Invalid resource for CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR\n");
-        result = false;
-      }
-
-      if (shareHandleExp != surfaceInfo.shared_handle)
-      {
-        log_error("Invalid shared handle for CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR\n");
-        result = false;
-      }
-#else
-      return false;
-#endif
-
-      if (paramSize != sizeof(surfaceInfo))
-      {
-        log_error("Invalid CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR parameter size: %i, expected: %i\n", paramSize, sizeof(surfaceInfo));
-        result = false;
-      }
-
-      paramSize = 0;
-      cl_dx9_media_adapter_type_khr mediaAdapterType;
-      if(clGetMemObjectInfo(object, CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR, sizeof(mediaAdapterType), &mediaAdapterType, &paramSize) != CL_SUCCESS)
-      {
-        log_error("clGetImageInfo(CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR) failed\n");
-        result = false;
-      }
-
-      if (adapterType != mediaAdapterType)
-      {
-        log_error("Invalid media adapter type for CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR\n");
-        result = false;
-      }
-
-      if (paramSize != sizeof(mediaAdapterType))
-      {
-        log_error("Invalid CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR parameter size: %i, expected: %i\n", paramSize, sizeof(mediaAdapterType));
-        result = false;
-      }
-    }
-    break;
-  default:
-    log_error("GetMemObjInfo(): Unknown adapter type!\n");
-    return false;
-    break;
-  }
-
-  return result;
-}
-
-bool ImageInfoVerify( cl_dx9_media_adapter_type_khr adapterType, const std::vector<cl_mem> &memObjList, unsigned int width, unsigned int height,
-                     std::auto_ptr<CSurfaceWrapper> &surface, void *sharedHandle)
-{
-  if (memObjList.size() != 2 && memObjList.size() != 3)
-  {
-    log_error("ImageInfoVerify(): Invalid object list parameter\n");
-    return false;
-  }
-
-  cl_image_format formatPlane;
-  formatPlane.image_channel_data_type = CL_UNORM_INT8;
-  formatPlane.image_channel_order = CL_R;
-
-  //plane 0 verification
-  if (!GetImageInfo(memObjList[0], formatPlane, sizeof(cl_uchar),
-    width * sizeof(cl_uchar),
-    0,
-    width, height, 0, 0))
-  {
-    log_error("clGetImageInfo failed\n");
-    return false;
-  }
-
-  switch (memObjList.size())
-  {
-  case 2:
-    {
-      formatPlane.image_channel_data_type = CL_UNORM_INT8;
-      formatPlane.image_channel_order = CL_RG;
-      if (!GetImageInfo(memObjList[1], formatPlane, sizeof(cl_uchar) * 2,
-        width * sizeof(cl_uchar),
-        0,
-        width / 2, height / 2, 0, 1))
-      {
-        log_error("clGetImageInfo failed\n");
-        return false;
-      }
-    }
-    break;
-  case 3:
-    {
-      if (!GetImageInfo(memObjList[1], formatPlane, sizeof(cl_uchar),
-        width * sizeof(cl_uchar) / 2,
-        0,
-        width / 2, height / 2, 0, 1))
-      {
-        log_error("clGetImageInfo failed\n");
-        return false;
-      }
-
-      if (!GetImageInfo(memObjList[2], formatPlane, sizeof(cl_uchar),
-        width * sizeof(cl_uchar) / 2,
-        0,
-        width / 2, height / 2, 0, 2))
-      {
-        log_error("clGetImageInfo failed\n");
-        return false;
-      }
-    }
-    break;
-  default:
-    log_error("ImageInfoVerify(): Invalid object list parameter\n");
-    return false;
-    break;
-  }
-
-  for (size_t i = 0; i < memObjList.size(); ++i)
-  {
-    if (!GetMemObjInfo(memObjList[i], adapterType, surface, sharedHandle))
-    {
-      log_error("clGetMemObjInfo(%i) failed\n", i);
-      return false;
-    }
-  }
-
-  return true;
-}
-
-bool ImageFormatCheck(cl_context context, cl_mem_object_type imageType, const cl_image_format imageFormatCheck)
-{
-  cl_uint imageFormatsNum = 0;
-  cl_int error = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE, imageType, 0, 0, &imageFormatsNum);
-  if(error != CL_SUCCESS)
-  {
-    log_error("clGetSupportedImageFormats failed\n");
-    return false;
-  }
-
-  if(imageFormatsNum < 1)
-  {
-    log_error("Invalid image format number returned by clGetSupportedImageFormats\n");
-    return false;
-  }
-
-  std::vector<cl_image_format> imageFormats(imageFormatsNum);
-  error = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE, imageType, imageFormatsNum, &imageFormats[0], 0);
-  if(error != CL_SUCCESS)
-  {
-    log_error("clGetSupportedImageFormats failed\n");
-    return false;
-  }
-
-  for(cl_uint i = 0; i < imageFormatsNum; ++i)
-  {
-    if(imageFormats[i].image_channel_data_type == imageFormatCheck.image_channel_data_type
-      && imageFormats[i].image_channel_order == imageFormatCheck.image_channel_order)
-    {
-      return true;
-    }
-  }
-
-  return false;
-}
-
-unsigned int ChannelNum( TSurfaceFormat surfaceFormat )
-{
-  switch(surfaceFormat)
-  {
-  case SURFACE_FORMAT_R32F:
-  case SURFACE_FORMAT_R16F:
-  case SURFACE_FORMAT_L16:
-  case SURFACE_FORMAT_A8:
-  case SURFACE_FORMAT_L8:
-    return 1;
-    break;
-  case SURFACE_FORMAT_G32R32F:
-  case SURFACE_FORMAT_G16R16F:
-  case SURFACE_FORMAT_G16R16:
-  case SURFACE_FORMAT_A8L8:
-    return 2;
-    break;
-  case SURFACE_FORMAT_NV12:
-  case SURFACE_FORMAT_YV12:
-    return 3;
-    break;
-  case SURFACE_FORMAT_A32B32G32R32F:
-  case SURFACE_FORMAT_A16B16G16R16F:
-  case SURFACE_FORMAT_A16B16G16R16:
-  case SURFACE_FORMAT_A8B8G8R8:
-  case SURFACE_FORMAT_X8B8G8R8:
-  case SURFACE_FORMAT_A8R8G8B8:
-  case SURFACE_FORMAT_X8R8G8B8:
-    return 4;
-    break;
-  default:
-    log_error("ChannelNum(): unknown surface format!\n");
-    return 0;
-    break;
-  }
-}
-
-unsigned int PlanesNum( TSurfaceFormat surfaceFormat )
-{
-  switch(surfaceFormat)
-  {
-  case SURFACE_FORMAT_R32F:
-  case SURFACE_FORMAT_R16F:
-  case SURFACE_FORMAT_L16:
-  case SURFACE_FORMAT_A8:
-  case SURFACE_FORMAT_L8:
-  case SURFACE_FORMAT_G32R32F:
-  case SURFACE_FORMAT_G16R16F:
-  case SURFACE_FORMAT_G16R16:
-  case SURFACE_FORMAT_A8L8:
-  case SURFACE_FORMAT_A32B32G32R32F:
-  case SURFACE_FORMAT_A16B16G16R16F:
-  case SURFACE_FORMAT_A16B16G16R16:
-  case SURFACE_FORMAT_A8B8G8R8:
-  case SURFACE_FORMAT_X8B8G8R8:
-  case SURFACE_FORMAT_A8R8G8B8:
-  case SURFACE_FORMAT_X8R8G8B8:
-    return 1;
-    break;
-  case SURFACE_FORMAT_NV12:
-    return 2;
-    break;
-  case SURFACE_FORMAT_YV12:
-    return 3;
-    break;
-  default:
-    log_error("PlanesNum(): unknown surface format!\n");
-    return 0;
-    break;
-  }
-}
-
-#if defined(_WIN32)
-D3DFORMAT SurfaceFormatToD3D(TSurfaceFormat surfaceFormat)
-{
-  switch(surfaceFormat)
-  {
-  case SURFACE_FORMAT_R32F:
-    return D3DFMT_R32F;
-    break;
-  case SURFACE_FORMAT_R16F:
-    return D3DFMT_R16F;
-    break;
-  case SURFACE_FORMAT_L16:
-    return D3DFMT_L16;
-    break;
-  case SURFACE_FORMAT_A8:
-    return D3DFMT_A8;
-    break;
-  case SURFACE_FORMAT_L8:
-    return D3DFMT_L8;
-    break;
-  case SURFACE_FORMAT_G32R32F:
-    return D3DFMT_G32R32F;
-    break;
-  case SURFACE_FORMAT_G16R16F:
-    return D3DFMT_G16R16F;
-    break;
-  case SURFACE_FORMAT_G16R16:
-    return D3DFMT_G16R16;
-    break;
-  case SURFACE_FORMAT_A8L8:
-    return D3DFMT_A8L8;
-    break;
-  case SURFACE_FORMAT_A32B32G32R32F:
-    return D3DFMT_A32B32G32R32F;
-    break;
-  case SURFACE_FORMAT_A16B16G16R16F:
-    return D3DFMT_A16B16G16R16F;
-    break;
-  case SURFACE_FORMAT_A16B16G16R16:
-    return D3DFMT_A16B16G16R16;
-    break;
-  case SURFACE_FORMAT_A8B8G8R8:
-    return D3DFMT_A8B8G8R8;
-    break;
-  case SURFACE_FORMAT_X8B8G8R8:
-    return D3DFMT_X8B8G8R8;
-    break;
-  case SURFACE_FORMAT_A8R8G8B8:
-    return D3DFMT_A8R8G8B8;
-    break;
-  case SURFACE_FORMAT_X8R8G8B8:
-    return D3DFMT_X8R8G8B8;
-    break;
-  case SURFACE_FORMAT_NV12:
-    return static_cast<D3DFORMAT>(MAKEFOURCC('N', 'V', '1', '2'));
-    break;
-  case SURFACE_FORMAT_YV12:
-    return static_cast<D3DFORMAT>(MAKEFOURCC('Y', 'V', '1', '2'));
-    break;
-  default:
-    log_error("SurfaceFormatToD3D(): unknown surface format!\n");
-    return D3DFMT_R32F;
-    break;
-  }
-}
-#endif
-
-bool DeviceCreate( cl_dx9_media_adapter_type_khr adapterType, std::auto_ptr<CDeviceWrapper> &device )
-{
-  switch (adapterType)
-  {
-#if defined(_WIN32)
-  case CL_ADAPTER_D3D9_KHR:
-    device = std::auto_ptr<CDeviceWrapper>(new CD3D9Wrapper());
-    break;
-  case CL_ADAPTER_D3D9EX_KHR:
-    device = std::auto_ptr<CDeviceWrapper>(new CD3D9ExWrapper());
-    break;
-  case CL_ADAPTER_DXVA_KHR:
-    device = std::auto_ptr<CDeviceWrapper>(new CDXVAWrapper());
-    break;
-#endif
-  default:
-    log_error("DeviceCreate(): Unknown adapter type!\n");
-    return false;
-    break;
-  }
-
-  return device->Status();
-}
-
-bool SurfaceFormatCheck( cl_dx9_media_adapter_type_khr adapterType, const CDeviceWrapper &device, TSurfaceFormat surfaceFormat )
-{
-  switch (adapterType)
-  {
-#if defined(_WIN32)
-  case CL_ADAPTER_D3D9_KHR:
-  case CL_ADAPTER_D3D9EX_KHR:
-  case CL_ADAPTER_DXVA_KHR:
-    {
-      D3DFORMAT d3dFormat = SurfaceFormatToD3D(surfaceFormat);
-      LPDIRECT3D9 d3d9 = static_cast<LPDIRECT3D9>(device.D3D());
-      D3DDISPLAYMODE d3ddm;
-      d3d9->GetAdapterDisplayMode(device.AdapterIdx(), &d3ddm);
-
-      if( FAILED(d3d9->CheckDeviceFormat(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, d3ddm.Format, 0, D3DRTYPE_SURFACE, d3dFormat)) )
-        return false;
-    }
-    break;
-#endif
-  default:
-    log_error("SurfaceFormatCheck(): Unknown adapter type!\n");
-    return false;
-    break;
-  }
-
-  return true;
-}
-
-bool SurfaceFormatToOCL(TSurfaceFormat surfaceFormat, cl_image_format &format)
-{
-  switch(surfaceFormat)
-  {
-  case SURFACE_FORMAT_R32F:
-    format.image_channel_order = CL_R;
-    format.image_channel_data_type = CL_FLOAT;
-    break;
-  case SURFACE_FORMAT_R16F:
-    format.image_channel_order = CL_R;
-    format.image_channel_data_type = CL_HALF_FLOAT;
-    break;
-  case SURFACE_FORMAT_L16:
-    format.image_channel_order = CL_R;
-    format.image_channel_data_type = CL_UNORM_INT16;
-    break;
-  case SURFACE_FORMAT_A8:
-    format.image_channel_order = CL_A;
-    format.image_channel_data_type = CL_UNORM_INT8;
-    break;
-  case SURFACE_FORMAT_L8:
-    format.image_channel_order = CL_R;
-    format.image_channel_data_type = CL_UNORM_INT8;
-    break;
-  case SURFACE_FORMAT_G32R32F:
-    format.image_channel_order = CL_RG;
-    format.image_channel_data_type = CL_FLOAT;
-    break;
-  case SURFACE_FORMAT_G16R16F:
-    format.image_channel_order = CL_RG;
-    format.image_channel_data_type = CL_HALF_FLOAT;
-    break;
-  case SURFACE_FORMAT_G16R16:
-    format.image_channel_order = CL_RG;
-    format.image_channel_data_type = CL_UNORM_INT16;
-    break;
-  case SURFACE_FORMAT_A8L8:
-    format.image_channel_order = CL_RG;
-    format.image_channel_data_type = CL_UNORM_INT8;
-    break;
-  case SURFACE_FORMAT_A32B32G32R32F:
-    format.image_channel_order = CL_RGBA;
-    format.image_channel_data_type = CL_FLOAT;
-    break;
-  case SURFACE_FORMAT_A16B16G16R16F:
-    format.image_channel_order = CL_RGBA;
-    format.image_channel_data_type = CL_HALF_FLOAT;
-    break;
-  case SURFACE_FORMAT_A16B16G16R16:
-    format.image_channel_order = CL_RGBA;
-    format.image_channel_data_type = CL_UNORM_INT16;
-    break;
-  case SURFACE_FORMAT_A8B8G8R8:
-    format.image_channel_order = CL_RGBA;
-    format.image_channel_data_type = CL_UNORM_INT8;
-    break;
-  case SURFACE_FORMAT_X8B8G8R8:
-    format.image_channel_order = CL_RGBA;
-    format.image_channel_data_type = CL_UNORM_INT8;
-    break;
-  case SURFACE_FORMAT_A8R8G8B8:
-    format.image_channel_order = CL_BGRA;
-    format.image_channel_data_type = CL_UNORM_INT8;
-    break;
-  case SURFACE_FORMAT_X8R8G8B8:
-    format.image_channel_order = CL_BGRA;
-    format.image_channel_data_type = CL_UNORM_INT8;
-    break;
-  case SURFACE_FORMAT_NV12:
-    format.image_channel_order = CL_R;
-    format.image_channel_data_type = CL_UNORM_INT8;
-    break;
-  case SURFACE_FORMAT_YV12:
-    format.image_channel_order = CL_R;
-    format.image_channel_data_type = CL_UNORM_INT8;
-    break;
-  default:
-    log_error("SurfaceFormatToOCL(): Unknown surface format!\n");
-    return false;
-    break;
-  }
-
-  return true;
-}
-
-void SurfaceFormatToString( TSurfaceFormat surfaceFormat, std::string &str )
-{
-  switch(surfaceFormat)
-  {
-  case SURFACE_FORMAT_R32F:
-    str = "R32F";
-    break;
-  case SURFACE_FORMAT_R16F:
-    str = "R16F";
-    break;
-  case SURFACE_FORMAT_L16:
-    str = "L16";
-    break;
-  case SURFACE_FORMAT_A8:
-    str = "A8";
-    break;
-  case SURFACE_FORMAT_L8:
-    str = "L8";
-    break;
-  case SURFACE_FORMAT_G32R32F:
-    str = "G32R32F";
-    break;
-  case SURFACE_FORMAT_G16R16F:
-    str = "G16R16F";
-    break;
-  case SURFACE_FORMAT_G16R16:
-    str = "G16R16";
-    break;
-  case SURFACE_FORMAT_A8L8:
-    str = "A8L8";
-    break;
-  case SURFACE_FORMAT_A32B32G32R32F:
-    str = "A32B32G32R32F";
-    break;
-  case SURFACE_FORMAT_A16B16G16R16F:
-    str = "A16B16G16R16F";
-    break;
-  case SURFACE_FORMAT_A16B16G16R16:
-    str = "A16B16G16R16";
-    break;
-  case SURFACE_FORMAT_A8B8G8R8:
-    str = "A8B8G8R8";
-    break;
-  case SURFACE_FORMAT_X8B8G8R8:
-    str = "X8B8G8R8";
-    break;
-  case SURFACE_FORMAT_A8R8G8B8:
-    str = "A8R8G8B8";
-    break;
-  case SURFACE_FORMAT_X8R8G8B8:
-    str = "X8R8G8B8";
-    break;
-  case SURFACE_FORMAT_NV12:
-    str = "NV12";
-    break;
-  case SURFACE_FORMAT_YV12:
-    str = "YV12";
-    break;
-  default:
-    log_error("SurfaceFormatToString(): unknown surface format!\n");
-    str = "unknown";
-    break;
-  }
-}
-
-bool MediaSurfaceCreate(cl_dx9_media_adapter_type_khr adapterType, unsigned int width, unsigned int height, TSurfaceFormat surfaceFormat,
-                        CDeviceWrapper &device, std::auto_ptr<CSurfaceWrapper> &surface, bool sharedHandle, void **objectSharedHandle)
-{
-  switch (adapterType)
-  {
-#if defined(_WIN32)
-  case CL_ADAPTER_D3D9_KHR:
-    {
-      surface = std::auto_ptr<CD3D9SurfaceWrapper>(new CD3D9SurfaceWrapper);
-      CD3D9SurfaceWrapper *d3dSurface = static_cast<CD3D9SurfaceWrapper *>(surface.get());
-      HRESULT hr = 0;
-      D3DFORMAT d3dFormat = SurfaceFormatToD3D(surfaceFormat);
-      LPDIRECT3DDEVICE9 d3d9Device = (LPDIRECT3DDEVICE9)device.Device();
-      hr = d3d9Device->CreateOffscreenPlainSurface(width, height, d3dFormat, D3DPOOL_DEFAULT, &(*d3dSurface),
-        sharedHandle ? objectSharedHandle: 0);
-
-      if ( FAILED(hr))
-      {
-        log_error("CreateOffscreenPlainSurface failed\n");
-        return false;
-      }
-    }
-    break;
-  case CL_ADAPTER_D3D9EX_KHR:
-    {
-      surface = std::auto_ptr<CD3D9SurfaceWrapper>(new CD3D9SurfaceWrapper);
-      CD3D9SurfaceWrapper *d3dSurface = static_cast<CD3D9SurfaceWrapper *>(surface.get());
-      HRESULT hr = 0;
-      D3DFORMAT d3dFormat = SurfaceFormatToD3D(surfaceFormat);
-      LPDIRECT3DDEVICE9EX d3d9ExDevice = (LPDIRECT3DDEVICE9EX)device.Device();
-      hr = d3d9ExDevice->CreateOffscreenPlainSurface(width, height, d3dFormat, D3DPOOL_DEFAULT, &(*d3dSurface),
-        sharedHandle ? objectSharedHandle: 0);
-
-      if ( FAILED(hr))
-      {
-        log_error("CreateOffscreenPlainSurface failed\n");
-        return false;
-      }
-    }
-    break;
-  case CL_ADAPTER_DXVA_KHR:
-    {
-      surface = std::auto_ptr<CD3D9SurfaceWrapper>(new CD3D9SurfaceWrapper);
-      CD3D9SurfaceWrapper *d3dSurface = static_cast<CD3D9SurfaceWrapper *>(surface.get());
-      HRESULT hr = 0;
-      D3DFORMAT d3dFormat = SurfaceFormatToD3D(surfaceFormat);
-      IDXVAHD_Device *dxvaDevice = (IDXVAHD_Device *)device.Device();
-      hr = dxvaDevice->CreateVideoSurface(width, height, d3dFormat, D3DPOOL_DEFAULT, 0,
-        DXVAHD_SURFACE_TYPE_VIDEO_INPUT,  1, &(*d3dSurface), sharedHandle ? objectSharedHandle: 0);
-
-      if ( FAILED(hr))
-      {
-        log_error("CreateVideoSurface failed\n");
-        return false;
-      }
-    }
-    break;
-#endif
-  default:
-    log_error("MediaSurfaceCreate(): Unknown adapter type!\n");
-    return false;
-    break;
-  }
-
-  return true;
-}
-
-cl_int deviceExistForCLTest(cl_platform_id platform,
-     cl_dx9_media_adapter_type_khr media_adapters_type,
-     void *media_adapters,
-     CResult &result,
-     TSharedHandleType sharedHandle /*default SHARED_HANDLE_ENABLED*/
-     )
-{
-    cl_int _error;
-    cl_uint devicesAllNum = 0;
-    std::string sharedHandleStr = (sharedHandle == SHARED_HANDLE_ENABLED)? "yes": "no";
-    std::string adapterStr;
-    AdapterToString(media_adapters_type, adapterStr);
-
-    _error = clGetDeviceIDsFromDX9MediaAdapterKHR(platform, 1,
-        &media_adapters_type, &media_adapters, CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, 0, 0, &devicesAllNum);
-
-    if (_error != CL_SUCCESS)
-    {
-        if(_error != CL_DEVICE_NOT_FOUND)
-        {
-           log_error("clGetDeviceIDsFromDX9MediaAdapterKHR failed: %s\n", IGetErrorString(_error));
-           result.ResultSub(CResult::TEST_ERROR);
-        }
-        else
-        {
-          log_info("Skipping test case, device type is not supported by a device (adapter type: %s, shared handle: %s)\n", adapterStr.c_str(), sharedHandleStr.c_str());
-          result.ResultSub(CResult::TEST_NOTSUPPORTED);
-        }
-    }
-
-    return _error;
-}
diff --git a/test_extensions/media_sharing/utils.h b/test_extensions/media_sharing/utils.h
deleted file mode 100644
index f98090ca83..0000000000
--- a/test_extensions/media_sharing/utils.h
+++ /dev/null
@@ -1,167 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef __UTILS_KHR_MEDIA_H
-#define __UTILS_KHR_MEDIA_H
-
-#include <string>
-#include <iostream>
-#include <memory>
-#include <vector>
-#include "wrappers.h"
-#include "CL/cl_dx9_media_sharing.h"
-
-#include "harness/typeWrappers.h"
-
-
-
-
-
-extern clGetDeviceIDsFromDX9MediaAdapterKHR_fn clGetDeviceIDsFromDX9MediaAdapterKHR;
-extern clCreateFromDX9MediaSurfaceKHR_fn clCreateFromDX9MediaSurfaceKHR;
-extern clEnqueueAcquireDX9MediaSurfacesKHR_fn clEnqueueAcquireDX9MediaSurfacesKHR;
-extern clEnqueueReleaseDX9MediaSurfacesKHR_fn clEnqueueReleaseDX9MediaSurfacesKHR;
-
-extern cl_platform_id gPlatformIDdetected;
-extern cl_device_id gDeviceIDdetected;
-extern cl_device_type gDeviceTypeSelected;
-
-#define NL "\n"
-#define TEST_NOT_IMPLEMENTED -1
-#define TEST_NOT_SUPPORTED -2
-
-enum TSurfaceFormat
-{
-  SURFACE_FORMAT_NV12,
-  SURFACE_FORMAT_YV12,
-  SURFACE_FORMAT_R32F,
-  SURFACE_FORMAT_R16F,
-  SURFACE_FORMAT_L16,
-  SURFACE_FORMAT_A8,
-  SURFACE_FORMAT_L8,
-  SURFACE_FORMAT_G32R32F,
-  SURFACE_FORMAT_G16R16F,
-  SURFACE_FORMAT_G16R16,
-  SURFACE_FORMAT_A8L8,
-  SURFACE_FORMAT_A32B32G32R32F,
-  SURFACE_FORMAT_A16B16G16R16F,
-  SURFACE_FORMAT_A16B16G16R16,
-  SURFACE_FORMAT_A8B8G8R8,
-  SURFACE_FORMAT_X8B8G8R8,
-  SURFACE_FORMAT_A8R8G8B8,
-  SURFACE_FORMAT_X8R8G8B8,
-};
-
-enum TContextFuncType
-{
-  CONTEXT_CREATE_DEFAULT,
-  CONTEXT_CREATE_FROM_TYPE,
-};
-
-enum TSharedHandleType
-{
-  SHARED_HANDLE_ENABLED,
-  SHARED_HANDLE_DISABLED,
-};
-
-class CResult {
-public:
-  enum TTestResult {
-    TEST_NORESULT,
-    TEST_NOTSUPPORTED,
-    TEST_PASS,
-    TEST_FAIL,
-    TEST_ERROR,
-  };
-
-  CResult();
-  ~CResult();
-
-  void ResultSub(TTestResult result);
-  TTestResult ResultLast() const;
-  int Result() const;
-
-private:
-  TTestResult _result;
-  TTestResult _resultLast;
-};
-
-void FunctionContextCreateToString(TContextFuncType contextCreateFunction, std::string &contextFunction);
-void AdapterToString(cl_dx9_media_adapter_type_khr adapterType, std::string &adapter);
-cl_context_info AdapterTypeToContextInfo(cl_dx9_media_adapter_type_khr adapterType);
-
-//YUV utils
-void YUVGenerateNV12(std::vector<cl_uchar> &yuv, unsigned int width, unsigned int height,
-                     cl_uchar valueMin, cl_uchar valueMax, double valueAdd = 0.0);
-void YUVGenerateYV12(std::vector<cl_uchar> &yuv, unsigned int width, unsigned int height,
-                     cl_uchar valueMin, cl_uchar valueMax, double valueAdd = 0.0);
-bool YUVGenerate(TSurfaceFormat surfaceFormat, std::vector<cl_uchar> &yuv, unsigned int width, unsigned int height,
-                 cl_uchar valueMin, cl_uchar valueMax, double valueAdd = 0.0);
-bool YUVSurfaceSetNV12(std::auto_ptr<CSurfaceWrapper> &surface, const std::vector<cl_uchar> &yuv,
-                       unsigned int width, unsigned int height);
-bool YUVSurfaceSetYV12(std::auto_ptr<CSurfaceWrapper> &surface, const std::vector<cl_uchar> &yuv,
-                       unsigned int width, unsigned int height);
-bool YUVSurfaceSet(TSurfaceFormat surfaceFormat, std::auto_ptr<CSurfaceWrapper> &surface, const std::vector<cl_uchar> &yuv,
-                   unsigned int width, unsigned int height);
-bool YUVSurfaceGetNV12(std::auto_ptr<CSurfaceWrapper> &surface, std::vector<cl_uchar> &yuv,
-                       unsigned int width, unsigned int height);
-bool YUVSurfaceGetYV12(std::auto_ptr<CSurfaceWrapper> &surface, std::vector<cl_uchar> &yuv,
-                       unsigned int width, unsigned int height);
-bool YUVSurfaceGet(TSurfaceFormat surfaceFormat, std::auto_ptr<CSurfaceWrapper> &surface, std::vector<cl_uchar> &yuv,
-                   unsigned int width, unsigned int height);
-bool YUVCompareNV12(const std::vector<cl_uchar> &yuvTest, const std::vector<cl_uchar> &yuvRef,
-                    unsigned int width, unsigned int height);
-bool YUVCompareYV12(const std::vector<cl_uchar> &yuvTest, const std::vector<cl_uchar> &yuvRef,
-                    unsigned int width, unsigned int height);
-bool YUVCompare(TSurfaceFormat surfaceFormat, const std::vector<cl_uchar> &yuvTest, const std::vector<cl_uchar> &yuvRef,
-                unsigned int width, unsigned int height);
-
-//other types utils
-void DataGenerate(TSurfaceFormat surfaceFormat, cl_channel_type type, std::vector<float> &data, unsigned int width, unsigned int height,
-                  unsigned int channelNum, float cmin = 0.0f, float cmax = 1.0f, float add = 0.0f);
-void DataGenerate(TSurfaceFormat surfaceFormat, cl_channel_type type, std::vector<cl_half> &data, unsigned int width, unsigned int height,
-                  unsigned int channelNum, float cmin = 0.0f, float cmax = 1.0f, float add = 0.0f);
-void DataGenerate(TSurfaceFormat surfaceFormat, cl_channel_type type, std::vector<cl_uchar> &data, unsigned int width, unsigned int height,
-                  unsigned int channelNum, float cmin = 0.0f, float cmax = 1.0f, float add = 0.0f);
-bool DataCompare(TSurfaceFormat surfaceFormat, cl_channel_type type, const std::vector<cl_float> &dataTest, const std::vector<cl_float> &dataExp,
-                 unsigned int width, unsigned int height, unsigned int channelNum);
-bool DataCompare(TSurfaceFormat surfaceFormat, cl_channel_type type, const std::vector<cl_half> &dataTest, const std::vector<cl_half> &dataExp,
-                 unsigned int width, unsigned int height, unsigned int channelNum);
-bool DataCompare(TSurfaceFormat surfaceFormat, cl_channel_type type, const std::vector<cl_uchar> &dataTest, const std::vector<cl_uchar> &dataExp,
-                 unsigned int width, unsigned int height, unsigned int channelNum);
-
-bool GetImageInfo(cl_mem object, cl_image_format formatExp, size_t elementSizeExp,
-                  size_t rowPitchExp, size_t slicePitchExp, size_t widthExp,
-                  size_t heightExp, size_t depthExp, unsigned int planeExp);
-bool GetMemObjInfo(cl_mem object, cl_dx9_media_adapter_type_khr adapterType, std::auto_ptr<CSurfaceWrapper> &surface, void *shareHandleExp);
-bool ImageInfoVerify(cl_dx9_media_adapter_type_khr adapterType, const std::vector<cl_mem> &memObjList, unsigned int width, unsigned int height,
-                     std::auto_ptr<CSurfaceWrapper> &surface, void *sharedHandle);
-bool ImageFormatCheck(cl_context context, cl_mem_object_type imageType, const cl_image_format imageFormatCheck);
-unsigned int ChannelNum(TSurfaceFormat surfaceFormat);
-unsigned int PlanesNum(TSurfaceFormat surfaceFormat);
-
-#if defined(_WIN32)
-D3DFORMAT SurfaceFormatToD3D(TSurfaceFormat surfaceFormat);
-#endif
-
-bool DeviceCreate(cl_dx9_media_adapter_type_khr adapterType, std::auto_ptr<CDeviceWrapper> &device);
-bool SurfaceFormatCheck(cl_dx9_media_adapter_type_khr adapterType, const CDeviceWrapper &device, TSurfaceFormat surfaceFormat);
-bool SurfaceFormatToOCL(TSurfaceFormat surfaceFormat, cl_image_format &format);
-void SurfaceFormatToString(TSurfaceFormat surfaceFormat, std::string &str );
-bool MediaSurfaceCreate(cl_dx9_media_adapter_type_khr adapterType, unsigned int width, unsigned int height, TSurfaceFormat surfaceFormat,
-                      CDeviceWrapper &device, std::auto_ptr<CSurfaceWrapper> &surface, bool sharedHandle, void **objectSharedHandle);
-
-cl_int deviceExistForCLTest(cl_platform_id platform,cl_dx9_media_adapter_type_khr media_adapters_type,void *media_adapters,CResult &result,TSharedHandleType sharedHandle=SHARED_HANDLE_DISABLED);
-#endif  // __UTILS_KHR_MEDIA_H
diff --git a/test_extensions/media_sharing/wrappers.cpp b/test_extensions/media_sharing/wrappers.cpp
deleted file mode 100644
index e7eb5b2bfd..0000000000
--- a/test_extensions/media_sharing/wrappers.cpp
+++ /dev/null
@@ -1,562 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "wrappers.h"
-#include "harness/errorHelpers.h"
-
-LPCTSTR CDeviceWrapper::WINDOW_TITLE = _T( "cl_khr_dx9_media_sharing" );
-const int CDeviceWrapper::WINDOW_WIDTH = 256;
-const int CDeviceWrapper::WINDOW_HEIGHT = 256;
-CDeviceWrapper::TAccelerationType CDeviceWrapper::accelerationType = CDeviceWrapper::ACCELERATION_HW;
-
-#if defined(_WIN32)
-const D3DFORMAT CDXVAWrapper::RENDER_TARGET_FORMAT = D3DFMT_X8R8G8B8;
-const D3DFORMAT CDXVAWrapper::VIDEO_FORMAT = D3DFMT_X8R8G8B8;
-const unsigned int CDXVAWrapper::VIDEO_FPS = 60;
-#endif
-
-#if defined(_WIN32)
-static LRESULT WINAPI WndProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam)
-{
-  switch(msg)
-  {
-  case WM_DESTROY:
-    PostQuitMessage(0);
-    return 0;
-  case WM_PAINT:
-    ValidateRect(hWnd, 0);
-    return 0;
-  default:
-    break;
-  }
-
-  return DefWindowProc(hWnd, msg, wParam, lParam);
-}
-#endif
-
-CDeviceWrapper::CDeviceWrapper()
-#if defined(_WIN32)
-:_hInstance(NULL),_hWnd(NULL)
-#endif
-{
-
-}
-
-void CDeviceWrapper::WindowInit()
-{
-#if defined(_WIN32)
-  _hInstance = GetModuleHandle(NULL);
-  static WNDCLASSEX wc =
-  {
-    sizeof(WNDCLASSEX),
-    CS_CLASSDC,
-    WndProc,
-    0L,
-    0L,
-    _hInstance,
-    NULL,
-    NULL,
-    NULL,
-    NULL,
-    WINDOW_TITLE,
-    NULL
-  };
-
-  RegisterClassEx(&wc);
-
-  _hWnd = CreateWindow(
-    WINDOW_TITLE,
-    WINDOW_TITLE,
-    WS_OVERLAPPEDWINDOW,
-    0, 0,
-    WINDOW_WIDTH, WINDOW_HEIGHT,
-    NULL,
-    NULL,
-    wc.hInstance,
-    NULL);
-
-  if (!_hWnd)
-  {
-    log_error("Failed to create window");
-    return;
-  }
-
-  ShowWindow(_hWnd,SW_SHOWDEFAULT);
-  UpdateWindow(_hWnd);
-#endif
-}
-
-void CDeviceWrapper::WindowDestroy()
-{
-#if defined(_WIN32)
-  if (_hWnd)
-    DestroyWindow(_hWnd);
-  _hWnd = NULL;
-#endif
-}
-
-#if defined(_WIN32)
-HWND CDeviceWrapper::WindowHandle() const
-{
-  return _hWnd;
-}
-#endif
-
-int CDeviceWrapper::WindowWidth() const
-{
-  return WINDOW_WIDTH;
-}
-
-int CDeviceWrapper::WindowHeight() const
-{
-  return WINDOW_HEIGHT;
-}
-
-CDeviceWrapper::TAccelerationType CDeviceWrapper::AccelerationType()
-{
-  return accelerationType;
-}
-
-void CDeviceWrapper::AccelerationType( TAccelerationType accelerationTypeNew )
-{
-  accelerationType = accelerationTypeNew;
-}
-
-CDeviceWrapper::~CDeviceWrapper()
-{
-  WindowDestroy();
-}
-
-#if defined(_WIN32)
-CD3D9Wrapper::CD3D9Wrapper():
-_d3d9(NULL), _d3dDevice(NULL), _status(DEVICE_PASS), _adapterIdx(0), _adapterFound(false)
-{
-  WindowInit();
-
-  _d3d9 = Direct3DCreate9(D3D_SDK_VERSION);
-  if (!_d3d9)
-  {
-    log_error("Direct3DCreate9 failed\n");
-    _status = DEVICE_FAIL;
-  }
-}
-
-CD3D9Wrapper::~CD3D9Wrapper()
-{
-  Destroy();
-
-  if(_d3d9)
-    _d3d9->Release();
-  _d3d9 = 0;
-}
-
-void CD3D9Wrapper::Destroy()
-{
-  if (_d3dDevice)
-    _d3dDevice->Release();
-  _d3dDevice = 0;
-}
-
-cl_int CD3D9Wrapper::Init()
-{
-  if (!WindowHandle())
-  {
-    log_error("D3D9: Window is not created\n");
-    _status = DEVICE_FAIL;
-    return DEVICE_FAIL;
-  }
-
-  if(!_d3d9 || DEVICE_PASS  != _status || !_adapterFound)
-    return false;
-
-  _d3d9->GetAdapterDisplayMode(_adapterIdx - 1, &_d3ddm);
-
-  D3DPRESENT_PARAMETERS d3dParams;
-  ZeroMemory(&d3dParams, sizeof(d3dParams));
-
-  d3dParams.Windowed = TRUE;
-  d3dParams.BackBufferCount = 1;
-  d3dParams.SwapEffect = D3DSWAPEFFECT_DISCARD;
-  d3dParams.hDeviceWindow = WindowHandle();
-  d3dParams.BackBufferWidth = WindowWidth();
-  d3dParams.BackBufferHeight = WindowHeight();
-  d3dParams.BackBufferFormat = _d3ddm.Format;
-
-  DWORD processingType = (AccelerationType() == ACCELERATION_HW)? D3DCREATE_HARDWARE_VERTEXPROCESSING:
-    D3DCREATE_SOFTWARE_VERTEXPROCESSING;
-
-  if ( FAILED( _d3d9->CreateDevice( _adapterIdx - 1, D3DDEVTYPE_HAL, WindowHandle(),
-    processingType, &d3dParams, &_d3dDevice) ) )
-  {
-    log_error("CreateDevice failed\n");
-    _status = DEVICE_FAIL;
-    return DEVICE_FAIL;
-  }
-
-  _d3dDevice->BeginScene();
-  _d3dDevice->Clear(0, NULL, D3DCLEAR_TARGET, 0, 1.0f, 0);
-  _d3dDevice->EndScene();
-
-  return true;
-}
-
-void * CD3D9Wrapper::D3D() const
-{
-  return _d3d9;
-}
-
-void *CD3D9Wrapper::Device() const
-{
-  return _d3dDevice;
-}
-
-D3DFORMAT CD3D9Wrapper::Format()
-{
-  return _d3ddm.Format;
-}
-
-D3DADAPTER_IDENTIFIER9 CD3D9Wrapper::Adapter()
-{
-  return _adapter;
-}
-
-TDeviceStatus CD3D9Wrapper::Status() const
-{
-  return _status;
-}
-
-bool CD3D9Wrapper::AdapterNext()
-{
-  if (DEVICE_PASS != _status)
-    return false;
-
-  _adapterFound = false;
-  for(; _adapterIdx < _d3d9->GetAdapterCount();)
-  {
-    ++_adapterIdx;
-    D3DCAPS9 caps;
-    if (FAILED(_d3d9->GetDeviceCaps(_adapterIdx - 1, D3DDEVTYPE_HAL, &caps)))
-      continue;
-
-    if(FAILED(_d3d9->GetAdapterIdentifier(_adapterIdx - 1, 0, &_adapter)))
-    {
-      log_error("D3D9: GetAdapterIdentifier failed\n");
-      _status = DEVICE_FAIL;
-      return false;
-    }
-
-    _adapterFound = true;
-
-    Destroy();
-    if(!Init())
-    {
-      _status = DEVICE_FAIL;
-      _adapterFound = false;
-    }
-    break;
-  }
-
-  return _adapterFound;
-}
-
-unsigned int CD3D9Wrapper::AdapterIdx() const
-{
-  return _adapterIdx - 1;
-}
-
-
-CD3D9ExWrapper::CD3D9ExWrapper():
-_d3d9Ex(NULL), _d3dDeviceEx(NULL), _status(DEVICE_PASS), _adapterIdx(0), _adapterFound(false)
-{
-  WindowInit();
-
-  HRESULT result = Direct3DCreate9Ex(D3D_SDK_VERSION, &_d3d9Ex);
-  if (FAILED(result) || !_d3d9Ex)
-  {
-    log_error("Direct3DCreate9Ex failed\n");
-    _status = DEVICE_FAIL;
-  }
-}
-
-CD3D9ExWrapper::~CD3D9ExWrapper()
-{
-  Destroy();
-
-  if(_d3d9Ex)
-    _d3d9Ex->Release();
-  _d3d9Ex = 0;
-}
-
-void * CD3D9ExWrapper::D3D() const
-{
-  return _d3d9Ex;
-}
-
-void *CD3D9ExWrapper::Device() const
-{
-  return _d3dDeviceEx;
-}
-
-D3DFORMAT CD3D9ExWrapper::Format()
-{
-  return _d3ddmEx.Format;
-}
-
-D3DADAPTER_IDENTIFIER9 CD3D9ExWrapper::Adapter()
-{
-  return _adapter;
-}
-
-cl_int CD3D9ExWrapper::Init()
-{
-  if (!WindowHandle())
-  {
-    log_error("D3D9EX: Window is not created\n");
-    _status = DEVICE_FAIL;
-    return DEVICE_FAIL;
-  }
-
-  if(!_d3d9Ex || DEVICE_FAIL == _status || !_adapterFound)
-    return DEVICE_FAIL;
-
-  RECT rect;
-  GetClientRect(WindowHandle(),&rect);
-
-  D3DPRESENT_PARAMETERS d3dParams;
-  ZeroMemory(&d3dParams, sizeof(d3dParams));
-
-  d3dParams.Windowed = TRUE;
-  d3dParams.SwapEffect = D3DSWAPEFFECT_FLIP;
-  d3dParams.BackBufferFormat = D3DFMT_X8R8G8B8;
-  d3dParams.BackBufferWidth = WindowWidth();
-  d3dParams.BackBufferHeight = WindowHeight();
-
-  d3dParams.BackBufferCount = 1;
-  d3dParams.hDeviceWindow = WindowHandle();
-
-  DWORD processingType = (AccelerationType() == ACCELERATION_HW)? D3DCREATE_HARDWARE_VERTEXPROCESSING:
-    D3DCREATE_SOFTWARE_VERTEXPROCESSING;
-
-  if ( FAILED( _d3d9Ex->CreateDeviceEx( _adapterIdx - 1, D3DDEVTYPE_HAL, WindowHandle(),
-    processingType, &d3dParams, NULL, &_d3dDeviceEx) ) )
-  {
-    log_error("CreateDeviceEx failed\n");
-    _status = DEVICE_FAIL;
-    return DEVICE_FAIL;
-  }
-
-  _d3dDeviceEx->BeginScene();
-  _d3dDeviceEx->Clear(0, NULL, D3DCLEAR_TARGET, 0, 1.0f, 0);
-  _d3dDeviceEx->EndScene();
-
-  return DEVICE_PASS;
-}
-
-void CD3D9ExWrapper::Destroy()
-{
-  if (_d3dDeviceEx)
-    _d3dDeviceEx->Release();
-  _d3dDeviceEx = 0;
-}
-
-TDeviceStatus CD3D9ExWrapper::Status() const
-{
-  return _status;
-}
-
-bool CD3D9ExWrapper::AdapterNext()
-{
-  if (DEVICE_FAIL == _status)
-    return false;
-
-  _adapterFound = false;
-  for(; _adapterIdx < _d3d9Ex->GetAdapterCount();)
-  {
-    ++_adapterIdx;
-    D3DCAPS9 caps;
-    if (FAILED(_d3d9Ex->GetDeviceCaps(_adapterIdx - 1, D3DDEVTYPE_HAL, &caps)))
-      continue;
-
-    if(FAILED(_d3d9Ex->GetAdapterIdentifier(_adapterIdx - 1, 0, &_adapter)))
-    {
-      log_error("D3D9EX: GetAdapterIdentifier failed\n");
-      _status = DEVICE_FAIL;
-      return false;
-    }
-
-    _adapterFound = true;
-    Destroy();
-    if(!Init())
-    {
-      _status = DEVICE_FAIL;
-      _adapterFound = _status;
-    }
-
-    break;
-  }
-
-  return _adapterFound;
-}
-
-unsigned int CD3D9ExWrapper::AdapterIdx() const
-{
-  return _adapterIdx - 1;
-}
-
-CDXVAWrapper::CDXVAWrapper():
-_dxvaDevice(NULL), _status(DEVICE_PASS), _adapterFound(false)
-{
-  _status = _d3d9.Status();
-}
-
-CDXVAWrapper::~CDXVAWrapper()
-{
-  DXVAHDDestroy();
-}
-
-void * CDXVAWrapper::Device() const
-{
-  return _dxvaDevice;
-}
-
-TDeviceStatus CDXVAWrapper::Status() const
-{
-    if(_status == DEVICE_FAIL || _d3d9.Status() == DEVICE_FAIL)
-        return DEVICE_FAIL;
-    else if(_status == DEVICE_NOTSUPPORTED || _d3d9.Status() == DEVICE_NOTSUPPORTED)
-        return DEVICE_NOTSUPPORTED;
-    else
-        return DEVICE_PASS;
-}
-
-bool CDXVAWrapper::AdapterNext()
-{
-  if (DEVICE_PASS != _status)
-    return false;
-
-  _adapterFound = _d3d9.AdapterNext();
-  _status = _d3d9.Status();
-  if (DEVICE_PASS != _status)
-  {
-    _adapterFound = false;
-    return false;
-  }
-
-  if (!_adapterFound)
-    return false;
-
-  DXVAHDDestroy();
-  _status = DXVAHDInit();
-  if (DEVICE_PASS != _status)
-  {
-    _adapterFound = false;
-    return false;
-  }
-
-  return true;
-}
-
-TDeviceStatus CDXVAWrapper::DXVAHDInit()
-{
-  if ((_status == DEVICE_FAIL) || (_d3d9.Status() == DEVICE_FAIL) || !_adapterFound)
-    return DEVICE_FAIL;
-
-  DXVAHD_RATIONAL fps = { VIDEO_FPS, 1 };
-
-  DXVAHD_CONTENT_DESC desc;
-  desc.InputFrameFormat= DXVAHD_FRAME_FORMAT_PROGRESSIVE;
-  desc.InputFrameRate = fps;
-  desc.InputWidth = WindowWidth();
-  desc.InputHeight = WindowHeight();
-  desc.OutputFrameRate = fps;
-  desc.OutputWidth = WindowWidth();
-  desc.OutputHeight = WindowHeight();
-
-#ifdef USE_SOFTWARE_PLUGIN
-  _status = DEVICE_FAIL;
-  return DEVICE_FAIL;
-#endif
-
-  HRESULT hr = DXVAHD_CreateDevice(static_cast<IDirect3DDevice9Ex *>(_d3d9.Device()),
-    &desc, DXVAHD_DEVICE_USAGE_PLAYBACK_NORMAL, NULL, &_dxvaDevice);
-  if(FAILED(hr))
-  {
-    if (hr == E_NOINTERFACE)
-    {
-      log_error("DXVAHD_CreateDevice skipped due to no supported devices!\n");
-      _status = DEVICE_NOTSUPPORTED;
-    }
-    else
-    {
-    log_error("DXVAHD_CreateDevice failed\n");
-    _status = DEVICE_FAIL;
-    }
-  }
-
-  return _status;
-}
-
-void CDXVAWrapper::DXVAHDDestroy()
-{
-  if (_dxvaDevice)
-    _dxvaDevice->Release();
-  _dxvaDevice = 0;
-}
-
-void * CDXVAWrapper::D3D() const
-{
-  return _d3d9.D3D();
-}
-
-unsigned int CDXVAWrapper::AdapterIdx() const
-{
-  return _d3d9.AdapterIdx();
-}
-
-const CD3D9ExWrapper & CDXVAWrapper::D3D9() const
-{
-  return _d3d9;
-}
-
-CD3D9SurfaceWrapper::CD3D9SurfaceWrapper():
-mMem(NULL)
-{
-
-}
-
-CD3D9SurfaceWrapper::CD3D9SurfaceWrapper( IDirect3DSurface9* mem ):
-mMem(mem)
-{
-
-}
-
-CD3D9SurfaceWrapper::~CD3D9SurfaceWrapper()
-{
-  if(mMem != NULL)
-    mMem->Release();
-  mMem = NULL;
-}
-
-#endif
-
-CSurfaceWrapper::CSurfaceWrapper()
-{
-
-}
-
-CSurfaceWrapper::~CSurfaceWrapper()
-{
-
-}
diff --git a/test_extensions/media_sharing/wrappers.h b/test_extensions/media_sharing/wrappers.h
deleted file mode 100644
index 45b70326d1..0000000000
--- a/test_extensions/media_sharing/wrappers.h
+++ /dev/null
@@ -1,197 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef __WRAPPERS_H
-#define __WRAPPERS_H
-
-#if defined(_WIN32)
-#include <d3d9.h>
-#if defined (__MINGW32__)
-#include <rpcsal.h>
-typedef unsigned char UINT8;
-#define __out
-#define __in
-#define __inout
-#define __out_bcount(size)
-#define __out_bcount_opt(size)
-#define __in_opt
-#define __in_ecount(size)
-#define __in_ecount_opt(size)
-#define __out_opt
-#define __out_ecount(size)
-#define __out_ecount_opt(size)
-#define __in_bcount_opt(size)
-#define __inout_opt
-#define __inout_bcount(size)
-#define __in_bcount(size)
-#define __deref_out
-#endif
-#include <dxvahd.h>
-#include <tchar.h>
-#endif
-
-enum TDeviceStatus
-{
-  DEVICE_NOTSUPPORTED,
-  DEVICE_PASS,
-  DEVICE_FAIL,
-};
-
-class CDeviceWrapper {
-public:
-  enum TAccelerationType
-  {
-    ACCELERATION_HW,
-    ACCELERATION_SW,
-  };
-
-  CDeviceWrapper();
-  virtual ~CDeviceWrapper();
-
-  virtual bool AdapterNext() = 0;
-  virtual unsigned int AdapterIdx() const = 0;
-  virtual void *Device() const = 0;
-  virtual TDeviceStatus Status() const = 0;
-  virtual void *D3D() const = 0;
-
-#if defined(_WIN32)
-  HWND WindowHandle() const;
-#endif
-  int WindowWidth() const;
-  int WindowHeight() const;
-  void WindowInit();
-
-
-  static TAccelerationType AccelerationType();
-  static void AccelerationType(TAccelerationType accelerationTypeNew);
-
-private:
-  static LPCTSTR WINDOW_TITLE;
-  static const int WINDOW_WIDTH;
-  static const int WINDOW_HEIGHT;
-  static TAccelerationType accelerationType;
-
-#if defined(_WIN32)
-  HMODULE _hInstance;
-  HWND _hWnd;
-#endif
-
-  void WindowDestroy();
-};
-
-class CSurfaceWrapper
-{
-public:
-  CSurfaceWrapper();
-  virtual ~CSurfaceWrapper();
-};
-
-#if defined(_WIN32)
-//windows specific wrappers
-class CD3D9Wrapper: public CDeviceWrapper {
-public:
-  CD3D9Wrapper();
-  ~CD3D9Wrapper();
-
-  virtual bool AdapterNext();
-  virtual unsigned int AdapterIdx() const;
-  virtual void *Device() const;
-  virtual TDeviceStatus Status() const;
-  virtual void *D3D() const;
-
-private:
-  LPDIRECT3D9 _d3d9;
-  LPDIRECT3DDEVICE9 _d3dDevice;
-  D3DDISPLAYMODE _d3ddm;
-  D3DADAPTER_IDENTIFIER9 _adapter;
-  TDeviceStatus _status;
-  unsigned int _adapterIdx;
-  bool _adapterFound;
-
-  D3DFORMAT Format();
-  D3DADAPTER_IDENTIFIER9 Adapter();
-  int Init();
-  void Destroy();
-};
-
-class CD3D9ExWrapper: public CDeviceWrapper {
-public:
-  CD3D9ExWrapper();
-  ~CD3D9ExWrapper();
-
-  virtual bool AdapterNext();
-  virtual unsigned int AdapterIdx() const;
-  virtual void *Device() const;
-  virtual TDeviceStatus Status() const;
-  virtual void *D3D() const;
-
-private:
-  LPDIRECT3D9EX _d3d9Ex;
-  LPDIRECT3DDEVICE9EX _d3dDeviceEx;
-  D3DDISPLAYMODEEX _d3ddmEx;
-  D3DADAPTER_IDENTIFIER9 _adapter;
-  TDeviceStatus _status;
-  unsigned int _adapterIdx;
-  bool _adapterFound;
-
-  D3DFORMAT Format();
-  D3DADAPTER_IDENTIFIER9 Adapter();
-  int Init();
-  void Destroy();
-};
-
-class CDXVAWrapper: public CDeviceWrapper {
-public:
-  CDXVAWrapper();
-  ~CDXVAWrapper();
-
-  virtual bool AdapterNext();
-  virtual unsigned int AdapterIdx() const;
-  virtual void *Device() const;
-  virtual TDeviceStatus Status() const;
-  virtual void *D3D() const;
-  const CD3D9ExWrapper &D3D9() const;
-
-private:
-  CD3D9ExWrapper _d3d9;
-  IDXVAHD_Device *_dxvaDevice;
-  TDeviceStatus _status;
-  bool _adapterFound;
-
-  static const D3DFORMAT RENDER_TARGET_FORMAT;
-  static const D3DFORMAT VIDEO_FORMAT;
-  static const unsigned int VIDEO_FPS;
-
-  TDeviceStatus DXVAHDInit();
-  void DXVAHDDestroy();
-};
-
-class CD3D9SurfaceWrapper: public CSurfaceWrapper
-{
-public:
-  CD3D9SurfaceWrapper();
-  CD3D9SurfaceWrapper( IDirect3DSurface9* mem );
-  ~CD3D9SurfaceWrapper();
-
-  operator IDirect3DSurface9*() { return mMem; }
-  IDirect3DSurface9* * operator&() { return &mMem; }
-  IDirect3DSurface9* operator->() const { return mMem; }
-
-private:
-  IDirect3DSurface9* mMem;
-};
-#endif
-
-#endif  // __D3D_WRAPPERS

From 8b5d3c205526b7de895ab5b8e4ddf4d108ebf948 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20=C5=81u=C5=BCy=C5=84ski?=
 <sebastian.jozef.luzynski@intel.com>
Date: Thu, 18 Feb 2021 11:07:10 +0100
Subject: [PATCH 048/158] Fix buffer tests memory leaks (#1165)

* Fix buffer tests memory leaks

This change fixes the buffer tests, which were broken by PR #1082. This pull request is similar to #1160.

* Fix formatting
---
 test_conformance/buffers/test_buffer_map.cpp  | 25 +++++---
 test_conformance/buffers/test_buffer_read.cpp | 61 ++++++++++++-------
 2 files changed, 55 insertions(+), 31 deletions(-)

diff --git a/test_conformance/buffers/test_buffer_map.cpp b/test_conformance/buffers/test_buffer_map.cpp
index 3cbcd387d7..382c7a3516 100644
--- a/test_conformance/buffers/test_buffer_map.cpp
+++ b/test_conformance/buffers/test_buffer_map.cpp
@@ -554,7 +554,6 @@ static int verify_read_struct( void *ptr, int n )
 static int test_buffer_map_read( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, size_t size, char *type, int loops,
                                  const char *kernelCode[], const char *kernelName[], int (*fn)(void *,int) )
 {
-    clMemWrapper buffers[5];
     void        *outptr[5];
     clProgramWrapper program[5];
     clKernelWrapper kernel[5];
@@ -593,7 +592,7 @@ static int test_buffer_map_read( cl_device_id deviceID, cl_context context, cl_c
 
         for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
         {
-
+            clMemWrapper buffer;
             outptr[i] = align_malloc( ptrSizes[i] * num_elements, min_alignment);
             if ( ! outptr[i] ){
                 log_error( " unable to allocate %d bytes of memory\n", (int)ptrSizes[i] * num_elements );
@@ -601,18 +600,22 @@ static int test_buffer_map_read( cl_device_id deviceID, cl_context context, cl_c
             }
 
             if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
-                buffers[i] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, outptr[i], &err);
+                buffer =
+                    clCreateBuffer(context, flag_set[src_flag_id],
+                                   ptrSizes[i] * num_elements, outptr[i], &err);
             else
-                buffers[i] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, NULL, &err);
+                buffer = clCreateBuffer(context, flag_set[src_flag_id],
+                                        ptrSizes[i] * num_elements, NULL, &err);
 
-            if (!buffers[i] || err)
+            if (!buffer || err)
             {
                 print_error(err, "clCreateBuffer failed\n" );
                 align_free( outptr[i] );
                 return -1;
             }
 
-            err = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), (void *)&buffers[i] );
+            err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&buffer);
+
             if ( err != CL_SUCCESS ){
                 print_error( err, "clSetKernelArg failed\n" );
                 align_free( outptr[i] );
@@ -631,8 +634,11 @@ static int test_buffer_map_read( cl_device_id deviceID, cl_context context, cl_c
                 return -1;
             }
 
-            mappedPtr = clEnqueueMapBuffer(queue, buffers[i], CL_TRUE, CL_MAP_READ, 0, ptrSizes[i]*num_elements, 0, NULL, NULL, &err);
-            if ( err != CL_SUCCESS ){
+            mappedPtr = clEnqueueMapBuffer(queue, buffer, CL_TRUE, CL_MAP_READ,
+                                           0, ptrSizes[i] * num_elements, 0,
+                                           NULL, NULL, &err);
+            if (err != CL_SUCCESS)
+            {
                 print_error( err, "clEnqueueMapBuffer failed" );
                 align_free( outptr[i] );
                 return -1;
@@ -648,7 +654,8 @@ static int test_buffer_map_read( cl_device_id deviceID, cl_context context, cl_c
                          1 << i, flag_set_names[src_flag_id]);
             }
 
-            err = clEnqueueUnmapMemObject(queue, buffers[i], mappedPtr, 0, NULL, NULL);
+            err = clEnqueueUnmapMemObject(queue, buffer, mappedPtr, 0, NULL,
+                                          NULL);
             test_error(err, "clEnqueueUnmapMemObject failed");
 
             // If we are using the outptr[i] as backing via USE_HOST_PTR we need to make sure we are done before freeing.
diff --git a/test_conformance/buffers/test_buffer_read.cpp b/test_conformance/buffers/test_buffer_read.cpp
index 0e533fa586..39cf3297e0 100644
--- a/test_conformance/buffers/test_buffer_read.cpp
+++ b/test_conformance/buffers/test_buffer_read.cpp
@@ -621,7 +621,6 @@ static int verify_read_struct(TestStruct *outptr, int n)
 int test_buffer_read( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, size_t size, char *type, int loops,
                       const char *kernelCode[], const char *kernelName[], int (*fn)(void *,int) )
 {
-    clMemWrapper buffers[5];
     void        *outptr[5];
     void        *inptr[5];
     clProgramWrapper program[5];
@@ -664,7 +663,7 @@ int test_buffer_read( cl_device_id deviceID, cl_context context, cl_command_queu
 
         for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
         {
-
+            clMemWrapper buffer;
             outptr[i] = align_malloc( ptrSizes[i] * num_elements, min_alignment);
             if ( ! outptr[i] ){
                 log_error( " unable to allocate %d bytes for outptr\n", (int)( ptrSizes[i] * num_elements ) );
@@ -678,17 +677,21 @@ int test_buffer_read( cl_device_id deviceID, cl_context context, cl_command_queu
 
 
             if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
-                buffers[i] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, inptr[i], &err);
+                buffer =
+                    clCreateBuffer(context, flag_set[src_flag_id],
+                                   ptrSizes[i] * num_elements, inptr[i], &err);
             else
-                buffers[i] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, NULL, &err);
-            if ( err != CL_SUCCESS ){
+                buffer = clCreateBuffer(context, flag_set[src_flag_id],
+                                        ptrSizes[i] * num_elements, NULL, &err);
+            if (err != CL_SUCCESS)
+            {
                 print_error(err, " clCreateBuffer failed\n" );
                 align_free( outptr[i] );
                 align_free( inptr[i] );
                 return -1;
             }
 
-            err = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), (void *)&buffers[i] );
+            err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&buffer);
             if ( err != CL_SUCCESS ){
                 print_error( err, "clSetKernelArg failed" );
                 align_free( outptr[i] );
@@ -696,7 +699,8 @@ int test_buffer_read( cl_device_id deviceID, cl_context context, cl_command_queu
                 return -1;
             }
 
-            err = clEnqueueNDRangeKernel( queue, kernel[i], 1, NULL, global_work_size, NULL, 0, NULL, NULL );
+            err = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL,
+                                         global_work_size, NULL, 0, NULL, NULL);
             if ( err != CL_SUCCESS ){
                 print_error( err, "clEnqueueNDRangeKernel failed" );
                 align_free( outptr[i] );
@@ -704,7 +708,9 @@ int test_buffer_read( cl_device_id deviceID, cl_context context, cl_command_queu
                 return -1;
             }
 
-            err = clEnqueueReadBuffer( queue, buffers[i], CL_TRUE, 0, ptrSizes[i]*num_elements, outptr[i], 0, NULL, NULL );
+            err = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0,
+                                      ptrSizes[i] * num_elements, outptr[i], 0,
+                                      NULL, NULL);
             if ( err != CL_SUCCESS ){
                 print_error( err, "clEnqueueReadBuffer failed" );
                 align_free( outptr[i] );
@@ -722,8 +728,11 @@ int test_buffer_read( cl_device_id deviceID, cl_context context, cl_command_queu
                          1 << i, flag_set_names[src_flag_id]);
             }
 
-            err = clEnqueueReadBuffer( queue, buffers[i], CL_TRUE, 0, ptrSizes[i]*num_elements, inptr[i], 0, NULL, NULL );
-            if ( err != CL_SUCCESS ){
+            err = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0,
+                                      ptrSizes[i] * num_elements, inptr[i], 0,
+                                      NULL, NULL);
+            if (err != CL_SUCCESS)
+            {
                 print_error( err, "clEnqueueReadBuffer failed" );
                 align_free( outptr[i] );
                 align_free( inptr[i] );
@@ -752,7 +761,6 @@ int test_buffer_read( cl_device_id deviceID, cl_context context, cl_command_queu
 int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, size_t size, char *type, int loops,
                             const char *kernelCode[], const char *kernelName[], int (*fn)(void *,int) )
 {
-    clMemWrapper buffers[5];
     clProgramWrapper program[5];
     clKernelWrapper kernel[5];
     clEventWrapper event;
@@ -796,7 +804,7 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman
 
         for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
         {
-
+            clMemWrapper buffer;
             outptr[i] = align_malloc(ptrSizes[i] * num_elements, min_alignment);
             if ( ! outptr[i] ){
                 log_error( " unable to allocate %d bytes for outptr\n", (int)(ptrSizes[i] * num_elements) );
@@ -812,9 +820,12 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman
 
 
             if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
-                buffers[i] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, inptr[i], &err);
+                buffer =
+                    clCreateBuffer(context, flag_set[src_flag_id],
+                                   ptrSizes[i] * num_elements, inptr[i], &err);
             else
-                buffers[i] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, NULL, &err);
+                buffer = clCreateBuffer(context, flag_set[src_flag_id],
+                                        ptrSizes[i] * num_elements, NULL, &err);
             if ( err != CL_SUCCESS ){
                 print_error(err, " clCreateBuffer failed\n" );
                 align_free( outptr[i] );
@@ -822,7 +833,7 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman
                 return -1;
             }
 
-            err = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), (void *)&buffers[i] );
+            err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&buffer);
             if ( err != CL_SUCCESS ){
                 print_error( err, "clSetKernelArg failed" );
                 align_free( outptr[i] );
@@ -839,7 +850,9 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman
             }
 
             lastIndex = ( num_elements * ( 1 << i ) - 1 ) * ptrSizes[0];
-            err = clEnqueueReadBuffer( queue, buffers[i], false, 0, ptrSizes[i]*num_elements, outptr[i], 0, NULL, &event );
+            err = clEnqueueReadBuffer(queue, buffer, false, 0,
+                                      ptrSizes[i] * num_elements, outptr[i], 0,
+                                      NULL, &event);
 #ifdef CHECK_FOR_NON_WAIT
             if ( ((uchar *)outptr[i])[lastIndex] ){
                 log_error( "    clEnqueueReadBuffer() possibly returned only after inappropriately waiting for execution to be finished\n" );
@@ -885,7 +898,6 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman
 int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, size_t size, char *type, int loops,
                                     const char *kernelCode[], const char *kernelName[], int (*fn)(void *,int) )
 {
-    clMemWrapper buffers[5];
     clProgramWrapper program[5];
     clKernelWrapper kernel[5];
     clEventWrapper event;
@@ -928,7 +940,7 @@ int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, c
 
         for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
         {
-
+            clMemWrapper buffer;
             outptr[i] = align_malloc(ptrSizes[i] * num_elements, min_alignment);
             if ( ! outptr[i] ){
                 log_error( " unable to allocate %d bytes for outptr\n", (int)(ptrSizes[i] * num_elements) );
@@ -943,9 +955,12 @@ int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, c
             memset( inptr[i], 0, ptrSizes[i] * num_elements );  // initialize to zero to tell difference
 
             if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
-                buffers[i] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, inptr[i], &err);
+                buffer =
+                    clCreateBuffer(context, flag_set[src_flag_id],
+                                   ptrSizes[i] * num_elements, inptr[i], &err);
             else
-                buffers[i] = clCreateBuffer(context, flag_set[src_flag_id],  ptrSizes[i] * num_elements, NULL, &err);
+                buffer = clCreateBuffer(context, flag_set[src_flag_id],
+                                        ptrSizes[i] * num_elements, NULL, &err);
             if ( err != CL_SUCCESS ){
                 print_error(err, " clCreateBuffer failed\n" );
                 align_free( outptr[i] );
@@ -953,7 +968,7 @@ int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, c
                 return -1;
             }
 
-            err = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), (void *)&buffers[i] );
+            err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&buffer);
             if ( err != CL_SUCCESS ){
                 print_error( err, "clSetKernelArgs failed" );
                 align_free( outptr[i] );
@@ -970,7 +985,9 @@ int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, c
             }
 
             lastIndex = ( num_elements * ( 1 << i ) - 1 ) * ptrSizes[0];
-            err = clEnqueueReadBuffer( queue, buffers[i], false, 0, ptrSizes[i]*num_elements, (void *)(outptr[i]), 0, NULL, &event );
+            err = clEnqueueReadBuffer(queue, buffer, false, 0,
+                                      ptrSizes[i] * num_elements,
+                                      (void *)(outptr[i]), 0, NULL, &event);
 #ifdef CHECK_FOR_NON_WAIT
             if ( ((uchar *)outptr[i])[lastIndex] ){
                 log_error( "    clEnqueueReadBuffer() possibly returned only after inappropriately waiting for execution to be finished\n" );

From 84d70e23c8ad2f6d999fce5e39a82c3fe198020d Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Thu, 18 Feb 2021 11:11:28 +0000
Subject: [PATCH 049/158] Fix incorrect use of kernel parameter (#1168)

This issue was introduced in 8ad1088a (Reduce difference between files
in math_brute_force (#1138), 2021-02-10).

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/binary_two_results_i.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
index ec00351a12..a20c0571b1 100644
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i.cpp
@@ -132,7 +132,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
-                        "( in1[i], in2[i], out2[i] );\n"
+                        "( in1[i], in2[i], out2 + i );\n"
                         "}\n" };
 
     const char *c3[] = {

From 66eb912ad53f0861e273d38daf50794343cc36d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=A9vin=20Petit?= <kevin.petit@arm.com>
Date: Tue, 23 Feb 2021 14:23:18 +0000
Subject: [PATCH 050/158] ci: move code format check out of the main job/script
 (#1154)

A bit cleaner than the current approach, which was written in haste to get CI going again.

Signed-off-by: Kevin Petit <kevin.petit@arm.com>
---
 .github/workflows/presubmit.yml | 21 +++++++++++----------
 presubmit.sh                    |  5 -----
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml
index 85db407db7..0c1778ebe6 100644
--- a/.github/workflows/presubmit.yml
+++ b/.github/workflows/presubmit.yml
@@ -6,7 +6,6 @@ jobs:
     name: Build ${{ matrix.os }} ${{ matrix.name }}
     runs-on: ${{ matrix.os }}
     env:
-      JOB_CHECK_FORMAT: ${{ matrix.format }}
       JOB_ARCHITECTURE: ${{ matrix.arch }}
       JOB_ENABLE_GL: ${{ matrix.gl }}
     strategy:
@@ -17,10 +16,6 @@ jobs:
           - os: ubuntu-20.04
             mainmatrix: true
             gl: 1
-          - os: ubuntu-20.04
-            mainmatrix: false
-            name: Format
-            format: 1
           - os: ubuntu-20.04
             mainmatrix: false
             name: Arm
@@ -30,11 +25,17 @@ jobs:
             name: AArch64
             arch: aarch64
     steps:
-      - name: Setup
-        run: if [[ "${{matrix.format}}" == "1" ]]; then sudo apt install -y clang-format; fi
       - uses: actions/checkout@v2
-        with:
-          fetch-depth: 0
       - name: Build
         run: ./presubmit.sh
-
+  formatcheck:
+    name: Check code format
+    runs-on: ubuntu-20.04
+    steps:
+      - name: Install packages
+        run: sudo apt install -y clang-format
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Check code format
+        run: ./check-format.sh
diff --git a/presubmit.sh b/presubmit.sh
index 646a7f00e3..b0aa934f1f 100755
--- a/presubmit.sh
+++ b/presubmit.sh
@@ -4,11 +4,6 @@ set -e
 
 export TOP=$(pwd)
 
-if [[ "${JOB_CHECK_FORMAT}" == "1" ]]; then
-    ./check-format.sh
-    exit $?
-fi
-
 TOOLCHAIN_URL_arm="https://releases.linaro.org/components/toolchain/binaries/7.5-2019.12/arm-linux-gnueabihf/gcc-linaro-7.5.0-2019.12-x86_64_arm-linux-gnueabihf.tar.xz"
 TOOLCHAIN_URL_aarch64="https://releases.linaro.org/components/toolchain/binaries/7.5-2019.12/aarch64-linux-gnu/gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz"
 

From 9a481c6167a199d51b5b38c19399bcb72f70c6d5 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Tue, 2 Mar 2021 15:50:14 +0000
Subject: [PATCH 051/158] Split math_brute_force files (#1169)

* Split math_brute_force files

Split each file into two: one covering float and the other covering
double. The goal is to make it possible to diff files to identify bugs
more easily, reduce differences between code for float and double, and
ultimately reduce code duplication in all math_brute_force.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>

* Address clang-format issues

In be936303 (Remove dead code in math_brute_force (#1117), 2021-01-20)
the code was reformatted using git-clang-format, which apparently is less
reliable than clang-format itself when changes occur in large files.

With the previous split of large files, git-clang-format complains about
the format of code originating from binary_two_results_i.cpp.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/CMakeLists.txt           |   39 +-
 .../math_brute_force/binary_double.cpp        |  947 ++++++++++++
 .../{binary.cpp => binary_float.cpp}          |  882 +----------
 .../{binary_i.cpp => binary_i_double.cpp}     |  770 ----------
 .../math_brute_force/binary_i_float.cpp       |  845 +++++++++++
 .../binary_operator_double.cpp                |  911 ++++++++++++
 ...operator.cpp => binary_operator_float.cpp} |  842 -----------
 .../math_brute_force/binary_two_results_i.cpp | 1298 -----------------
 .../binary_two_results_i_double.cpp           |  671 +++++++++
 .../binary_two_results_i_float.cpp            |  657 +++++++++
 .../{i_unary.cpp => i_unary_double.cpp}       |  340 -----
 .../math_brute_force/i_unary_float.cpp        |  370 +++++
 ...cro_binary.cpp => macro_binary_double.cpp} |  773 ----------
 .../math_brute_force/macro_binary_float.cpp   |  832 +++++++++++
 .../math_brute_force/macro_unary_double.cpp   |  598 ++++++++
 ...{macro_unary.cpp => macro_unary_float.cpp} |  541 -------
 .../{mad.cpp => mad_double.cpp}               |  372 -----
 .../math_brute_force/mad_float.cpp            |  402 +++++
 .../math_brute_force/ternary_double.cpp       |  842 +++++++++++
 .../{ternary.cpp => ternary_float.cpp}        |  811 ----------
 .../math_brute_force/unary_double.cpp         |  662 +++++++++
 .../{unary.cpp => unary_float.cpp}            |  594 --------
 .../unary_two_results_double.cpp              |  523 +++++++
 ...esults.cpp => unary_two_results_float.cpp} |  493 -------
 ...s_i.cpp => unary_two_results_i_double.cpp} |  455 ------
 .../unary_two_results_i_float.cpp             |  492 +++++++
 .../math_brute_force/unary_u_double.cpp       |  385 +++++
 .../{unary_u.cpp => unary_u_float.cpp}        |  355 -----
 28 files changed, 9166 insertions(+), 8536 deletions(-)
 create mode 100644 test_conformance/math_brute_force/binary_double.cpp
 rename test_conformance/math_brute_force/{binary.cpp => binary_float.cpp} (55%)
 rename test_conformance/math_brute_force/{binary_i.cpp => binary_i_double.cpp} (54%)
 create mode 100644 test_conformance/math_brute_force/binary_i_float.cpp
 create mode 100644 test_conformance/math_brute_force/binary_operator_double.cpp
 rename test_conformance/math_brute_force/{binary_operator.cpp => binary_operator_float.cpp} (54%)
 delete mode 100644 test_conformance/math_brute_force/binary_two_results_i.cpp
 create mode 100644 test_conformance/math_brute_force/binary_two_results_i_double.cpp
 create mode 100644 test_conformance/math_brute_force/binary_two_results_i_float.cpp
 rename test_conformance/math_brute_force/{i_unary.cpp => i_unary_double.cpp} (52%)
 create mode 100644 test_conformance/math_brute_force/i_unary_float.cpp
 rename test_conformance/math_brute_force/{macro_binary.cpp => macro_binary_double.cpp} (53%)
 create mode 100644 test_conformance/math_brute_force/macro_binary_float.cpp
 create mode 100644 test_conformance/math_brute_force/macro_unary_double.cpp
 rename test_conformance/math_brute_force/{macro_unary.cpp => macro_unary_float.cpp} (52%)
 rename test_conformance/math_brute_force/{mad.cpp => mad_double.cpp} (52%)
 create mode 100644 test_conformance/math_brute_force/mad_float.cpp
 create mode 100644 test_conformance/math_brute_force/ternary_double.cpp
 rename test_conformance/math_brute_force/{ternary.cpp => ternary_float.cpp} (52%)
 create mode 100644 test_conformance/math_brute_force/unary_double.cpp
 rename test_conformance/math_brute_force/{unary.cpp => unary_float.cpp} (58%)
 create mode 100644 test_conformance/math_brute_force/unary_two_results_double.cpp
 rename test_conformance/math_brute_force/{unary_two_results.cpp => unary_two_results_float.cpp} (56%)
 rename test_conformance/math_brute_force/{unary_two_results_i.cpp => unary_two_results_i_double.cpp} (52%)
 create mode 100644 test_conformance/math_brute_force/unary_two_results_i_float.cpp
 create mode 100644 test_conformance/math_brute_force/unary_u_double.cpp
 rename test_conformance/math_brute_force/{unary_u.cpp => unary_u_float.cpp} (54%)

diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt
index 957233caa3..96433945eb 100644
--- a/test_conformance/math_brute_force/CMakeLists.txt
+++ b/test_conformance/math_brute_force/CMakeLists.txt
@@ -1,23 +1,36 @@
 set(MODULE_NAME BRUTEFORCE)
 
 set(${MODULE_NAME}_SOURCES
-    binary.cpp
-    binary_i.cpp
-    binary_operator.cpp
-    binary_two_results_i.cpp
+    binary_double.cpp
+    binary_float.cpp
+    binary_i_double.cpp
+    binary_i_float.cpp
+    binary_operator_double.cpp
+    binary_operator_float.cpp
+    binary_two_results_i_double.cpp
+    binary_two_results_i_float.cpp
     function_list.cpp
-    i_unary.cpp
-    macro_binary.cpp
-    macro_unary.cpp
-    mad.cpp
+    i_unary_double.cpp
+    i_unary_float.cpp
+    macro_binary_double.cpp
+    macro_binary_float.cpp
+    macro_unary_double.cpp
+    macro_unary_float.cpp
+    mad_double.cpp
+    mad_float.cpp
     main.cpp
     reference_math.cpp
     sleep.cpp
-    ternary.cpp
-    unary.cpp
-    unary_two_results.cpp
-    unary_two_results_i.cpp
-    unary_u.cpp
+    ternary_double.cpp
+    ternary_float.cpp
+    unary_double.cpp
+    unary_float.cpp
+    unary_two_results_double.cpp
+    unary_two_results_float.cpp
+    unary_two_results_i_double.cpp
+    unary_two_results_i_float.cpp
+    unary_u_double.cpp
+    unary_u_float.cpp
     utility.cpp
 )
 
diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
new file mode 100644
index 0000000000..7bff9acad5
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -0,0 +1,947 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
+
+static int BuildKernelDouble(const char *name, int vectorSize,
+                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                             bool relaxedMode)
+{
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
+    // Variant for 3-component vectors: uses vload3/vstore3 on scalar pointers.
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       d0 = ",
+        name,
+        "( d0, d1 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, d1 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    // Use the generic source unless the selected vector size is 3.
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+    // Same name as the __kernel function emitted in the source above.
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
+                       relaxedMode);
+}
+
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // index of the first vector size to build
+    cl_uint kernel_count; // kernel count forwarded to the kernel builder
+    cl_kernel **kernels; // kernels[i]: kernel array for vector size index i
+    cl_program *programs; // programs[i]: program for vector size index i
+    const char *nameInCode; // function name spliced into kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{
+    BuildKernelInfo *info = (BuildKernelInfo *)p; // p is a BuildKernelInfo
+    cl_uint i = info->offset + job_id; // vector size index for this job
+    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
+                             info->kernels[i], info->programs + i,
+                             info->relaxedMode);
+}
+
+// Thread specific data for a worker thread
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // first input buffer (sub-buffer of gInBuffer)
+    cl_mem inBuf2; // second input buffer (sub-buffer of gInBuffer2)
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffer per vector size
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    double maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdata d; // random generator state, seeded per thread
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    int isFDim; // non-zero when f->nameInCode is "fdim"
+    int skipNanInf; // NOTE(review): set to 0 by the visible setup code
+    int isNextafter; // copied from the caller-supplied isNextafter flag
+    bool relaxedMode; // True if test is running in relaxed mode, false
+                      // otherwise.
+} TestInfo;
+
+// Difficult inputs; the second half mirrors the first with positive sign.
+static const double specialValuesDouble[] = {
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
+    -1000.,
+    -100.,
+    -4.0,
+    -3.5,
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
+    -0.5,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
+    -0.25,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
+
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000.,
+    +100.,
+    +4.0,
+    +3.5,
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
+    +0.5,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
+    +0.25,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
+};
+
+static size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
+
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
+
+static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
+                                                int isNextafter,
+                                                bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    // Init test_info; zeroing it lets the exit path see unset members as NULL
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    }
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
+    test_info.skipNanInf = 0;
+    test_info.isNextafter = isNextafter;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zu, %zu}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels: one build job per vector size index
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors (done even when the run failed)
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input arrays
+        double *p = (double *)gIn;
+        double *p2 = (double *)gIn2;
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+        {
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+            p2[j] = DoubleFromUInt32(genrand_int32(d));
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release per-thread resources before returning
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit; // release per-thread resources before returning
+        }
+
+        // Run the kernels, timing thread 0's kernel for each vector size
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
+
+exit:
+    // Release everything acquired above; also reached via goto on failure
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
+
+// Worker for one job: fills this thread's slices of gIn/gIn2 (pairs of
+// specialValuesDouble while they last, then random 64-bit patterns), runs
+// job->k[size][thread_id] for every vector size on tinfo->tQueue, then
+// compares each result bitwise with func.f_ff and, on mismatch, applies the
+// ULP tolerance (plus FTZ retries when job->ftz). Returns CL_SUCCESS, the
+// failing OpenCL call's error code, or -1 for an out-of-tolerance result.
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+
+    int isNextafter = job->isNextafter;
+    cl_ulong *t;
+    cl_double *r;
+    cl_double *s;
+    cl_double *s2;
+
+    Force64BitFPUPrecision();
+
+    // start the map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Init input array
+    cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
+    cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
+    j = 0;
+    int totalSpecialValueCount =
+        specialValuesDoubleCount * specialValuesDoubleCount;
+    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)indx)
+    { // test edge cases
+        cl_double *fp = (cl_double *)p;
+        cl_double *fp2 = (cl_double *)p2;
+        uint32_t x, y;
+
+        x = (job_id * buffer_elements) % specialValuesDoubleCount;
+        y = (job_id * buffer_elements) / specialValuesDoubleCount;
+
+        for (; j < buffer_elements; j++)
+        {
+            fp[j] = specialValuesDouble[x];
+            fp2[j] = specialValuesDouble[y];
+            if (++x >= specialValuesDoubleCount)
+            {
+                x = 0;
+                y++;
+                if (y >= specialValuesDoubleCount) break;
+            }
+        }
+    }
+
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
+    {
+        p[j] = genrand_int64(d);
+        p2[j] = genrand_int64(d);
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            goto exit;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // Fill the result buffer with garbage so old results don't carry over
+        uint32_t pattern = 0xffffdead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
+            goto exit;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // this thread's own cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            goto exit;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
+    s = (cl_double *)gIn + thread_id * buffer_elements;
+    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        r[j] = (cl_double)func.f_ff(s[j], s2[j]);
+
+    // Read the data back; the queue is in order, so only the last map blocks.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            goto exit;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        goto exit;
+    }
+
+    // Verify data
+    t = (cl_ulong *)r;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_ulong *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (t[j] != q[j])
+            {
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_ff(s[j], s2[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail && ftz)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsDoubleResultSubnormal(correct, ulps))
+                    {
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // nextafter on FTZ platforms may return the smallest
+                    // normal double (2^-1022) given a denormal or zero
+                    // first argument: the argument is flushed to zero,
+                    // and with denorms treated as zero the next
+                    // representable number toward the second argument is
+                    // the smallest normal, with the second argument's sign.
+                    if (isNextafter)
+                    {
+                        if (IsDoubleSubnormal(s[j]) || s[j] == 0.0f)
+                        {
+                            cl_double value = copysign(twoToMinus1022, s2[j]);
+                            fail = fail && (test != value);
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                    else
+                    {
+                        // retry per section 6.5.3.3
+                        if (IsDoubleSubnormal(s[j]))
+                        {
+                            long double correct2 = func.f_ff(0.0, s2[j]);
+                            long double correct3 = func.f_ff(-0.0, s2[j]);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+
+                            // try with both args as zero
+                            if (IsDoubleSubnormal(s2[j]))
+                            {
+                                correct2 = func.f_ff(0.0, 0.0);
+                                correct3 = func.f_ff(-0.0, 0.0);
+                                long double correct4 = func.f_ff(0.0, -0.0);
+                                long double correct5 = func.f_ff(-0.0, -0.0);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= ulps))
+                                        && (!(fabsf(err3) <= ulps))
+                                        && (!(fabsf(err4) <= ulps))
+                                        && (!(fabsf(err5) <= ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+
+                                // retry per section 6.5.3.4
+                                if (IsDoubleResultSubnormal(correct2, ulps)
+                                    || IsDoubleResultSubnormal(correct3, ulps)
+                                    || IsDoubleResultSubnormal(correct4, ulps)
+                                    || IsDoubleResultSubnormal(correct5, ulps))
+                                {
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
+                                }
+                            }
+                        }
+                        else if (IsDoubleSubnormal(s2[j]))
+                        {
+                            long double correct2 = func.f_ff(s[j], 0.0);
+                            long double correct3 = func.f_ff(s[j], -0.0);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                }
+
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                    tinfo->maxErrorValue2 = s2[j];
+                }
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, "
+                               "%.13la}: *%.13la vs. %.13la\n",
+                               name, sizeNames[k], err, s[j], s2[j], r[j],
+                               test);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    // Report progress whenever the low 28 bits of base are all zero.
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10zu "
+                 "ulps:%5.3f ThreadCount:%2u\n",
+                 base, job->step, (size_t)job->scale, buffer_elements,
+                 job->ulps, job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+exit:
+    return error;
+}
+
+int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+{ // Forwards to the common driver with 0 as its third argument.
+    return TestFunc_Double_Double_Double_common(f, d, 0, relaxedMode);
+}
+
+int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata d,
+                                            bool relaxedMode)
+{ // Forwards to the common driver with 1 as its third argument.
+    return TestFunc_Double_Double_Double_common(f, d, 1, relaxedMode);
+}
diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary_float.cpp
similarity index 55%
rename from test_conformance/math_brute_force/binary.cpp
rename to test_conformance/math_brute_force/binary_float.cpp
index 699c09442d..0ad7b87af2 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -21,7 +21,6 @@
 #include <string.h>
 
 const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
-const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
 
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
@@ -108,94 +107,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* in1, __global double",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global double* in, __global double* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 d0 = vload3( 0, in + 3 * i );\n"
-        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-        "       d0 = ",
-        name,
-        "( d0, d1 );\n"
-        "       vstore3( d0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 d0;\n"
-        "       double3 d1;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       d0 = ",
-        name,
-        "( d0, d1 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = d0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = d0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
 typedef struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
@@ -215,16 +126,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
-                             info->kernels[i], info->programs + i,
-                             info->relaxedMode);
-}
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -1194,790 +1095,13 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     return error;
 }
 
-// A table of more difficult cases to get right
-static const double specialValuesDouble[] = {
-    -NAN,
-    -INFINITY,
-    -DBL_MAX,
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
-    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
-    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
-    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
-    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
-    -1000.,
-    -100.,
-    -4.0,
-    -3.5,
-    -3.0,
-    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
-    -2.5,
-    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
-    -2.0,
-    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
-    -1.5,
-    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
-    -1.0,
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
-    -0.5,
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
-    -0.25,
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
-    -DBL_MIN,
-    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
-    -0.0,
-
-    +NAN,
-    +INFINITY,
-    +DBL_MAX,
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
-    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
-    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
-    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
-    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
-    +1000.,
-    +100.,
-    +4.0,
-    +3.5,
-    +3.0,
-    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
-    +2.5,
-    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
-    +2.0,
-    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
-    +1.5,
-    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
-    +1.0,
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
-    +0.5,
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
-    +0.25,
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
-    +DBL_MIN,
-    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
-    +0.0,
-};
-
-static size_t specialValuesDoubleCount =
-    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
-
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
-
-static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
-                                                int isNextafter,
-                                                bool relaxedMode)
+int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo test_info;
-    cl_int error;
-    size_t i, j;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    double maxErrorVal2 = 0.0;
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_double)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->double_ulps;
-    test_info.ftz = f->ftz || gForceFTZ;
-
-    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
-    test_info.skipNanInf = 0;
-    test_info.isNextafter = isNextafter;
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        double *p = (double *)gIn;
-        double *p2 = (double *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
-    vlog("\n");
-
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
-{
-    const TestInfo *job = (const TestInfo *)data;
-    size_t buffer_elements = job->subBufferSize;
-    size_t buffer_size = buffer_elements * sizeof(cl_double);
-    cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
-    float ulps = job->ulps;
-    dptr func = job->f->dfunc;
-    int ftz = job->ftz;
-    MTdata d = tinfo->d;
-    cl_uint j, k;
-    cl_int error;
-    const char *name = job->f->name;
-
-    int isNextafter = job->isNextafter;
-    cl_ulong *t;
-    cl_double *r;
-    cl_double *s;
-    cl_double *s2;
-
-    Force64BitFPUPrecision();
-
-    // start the map of the output arrays
-    cl_event e[VECTOR_SIZE_COUNT];
-    cl_ulong *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
-
-    // Init input array
-    cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
-    cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
-    j = 0;
-    int totalSpecialValueCount =
-        specialValuesDoubleCount * specialValuesDoubleCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
-
-    if (job_id <= (cl_uint)indx)
-    { // test edge cases
-        cl_double *fp = (cl_double *)p;
-        cl_double *fp2 = (cl_double *)p2;
-        uint32_t x, y;
-
-        x = (job_id * buffer_elements) % specialValuesDoubleCount;
-        y = (job_id * buffer_elements) / specialValuesDoubleCount;
-
-        for (; j < buffer_elements; j++)
-        {
-            fp[j] = specialValuesDouble[x];
-            fp2[j] = specialValuesDouble[y];
-            if (++x >= specialValuesDoubleCount)
-            {
-                x = 0;
-                y++;
-                if (y >= specialValuesDoubleCount) break;
-            }
-        }
-    }
-
-    // Init any remaining values.
-    for (; j < buffer_elements; j++)
-    {
-        p[j] = genrand_int64(d);
-        p2[j] = genrand_int64(d);
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
-                                      buffer_size, p, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
-                                      buffer_size, p2, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            goto exit;
-        }
-        if ((error = clReleaseEvent(e[j])))
-        {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // Fill the result buffer with garbage, so that old results don't carry
-        // over
-        uint32_t pattern = 0xffffdead;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // run the kernel
-        size_t vectorCount =
-            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
-                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];
-
-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-
-        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
-                                            &vectorCount, NULL, 0, NULL, NULL)))
-        {
-            vlog_error("FAILED -- could not execute kernel\n");
-            goto exit;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
-
-    if (gSkipCorrectnessTesting) return CL_SUCCESS;
-
-    // Calculate the correctly rounded reference result
-    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
-    s = (cl_double *)gIn + thread_id * buffer_elements;
-    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
-        r[j] = (cl_double)func.f_ff(s[j], s2[j]);
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
-            buffer_size, 0, NULL, NULL, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            goto exit;
-        }
-    }
-
-    // Wait for the last buffer
-    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
-    // Verify data
-    t = (cl_ulong *)r;
-    for (j = 0; j < buffer_elements; j++)
-    {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        {
-            cl_ulong *q = out[k];
-
-            // If we aren't getting the correctly rounded result
-            if (t[j] != q[j])
-            {
-                cl_double test = ((cl_double *)q)[j];
-                long double correct = func.f_ff(s[j], s2[j]);
-                float err = Bruteforce_Ulp_Error_Double(test, correct);
-                int fail = !(fabsf(err) <= ulps);
-
-                if (fail && ftz)
-                {
-                    // retry per section 6.5.3.2
-                    if (IsDoubleResultSubnormal(correct, ulps))
-                    {
-                        fail = fail && (test != 0.0f);
-                        if (!fail) err = 0.0f;
-                    }
-
-                    // nextafter on FTZ platforms may return the smallest
-                    // normal float (2^-126) given a denormal or a zero
-                    // as the first argument. The rationale here is that
-                    // nextafter flushes the argument to zero and then
-                    // returns the next representable number in the
-                    // direction of the second argument, and since
-                    // denorms are considered as zero, the smallest
-                    // normal number is the next representable number.
-                    // In which case, it should have the same sign as the
-                    // second argument.
-                    if (isNextafter)
-                    {
-                        if (IsDoubleSubnormal(s[j]) || s[j] == 0.0f)
-                        {
-                            cl_double value = copysign(twoToMinus1022, s2[j]);
-                            fail = fail && (test != value);
-                            if (!fail) err = 0.0f;
-                        }
-                    }
-                    else
-                    {
-                        // retry per section 6.5.3.3
-                        if (IsDoubleSubnormal(s[j]))
-                        {
-                            long double correct2 = func.f_ff(0.0, s2[j]);
-                            long double correct3 = func.f_ff(-0.0, s2[j]);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct2);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            fail = fail
-                                && ((!(fabsf(err2) <= ulps))
-                                    && (!(fabsf(err3) <= ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2, ulps)
-                                || IsDoubleResultSubnormal(correct3, ulps))
-                            {
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-
-                            // try with both args as zero
-                            if (IsDoubleSubnormal(s2[j]))
-                            {
-                                correct2 = func.f_ff(0.0, 0.0);
-                                correct3 = func.f_ff(-0.0, 0.0);
-                                long double correct4 = func.f_ff(0.0, -0.0);
-                                long double correct5 = func.f_ff(-0.0, -0.0);
-                                err2 =
-                                    Bruteforce_Ulp_Error_Double(test, correct2);
-                                err3 =
-                                    Bruteforce_Ulp_Error_Double(test, correct3);
-                                float err4 =
-                                    Bruteforce_Ulp_Error_Double(test, correct4);
-                                float err5 =
-                                    Bruteforce_Ulp_Error_Double(test, correct5);
-                                fail = fail
-                                    && ((!(fabsf(err2) <= ulps))
-                                        && (!(fabsf(err3) <= ulps))
-                                        && (!(fabsf(err4) <= ulps))
-                                        && (!(fabsf(err5) <= ulps)));
-                                if (fabsf(err2) < fabsf(err)) err = err2;
-                                if (fabsf(err3) < fabsf(err)) err = err3;
-                                if (fabsf(err4) < fabsf(err)) err = err4;
-                                if (fabsf(err5) < fabsf(err)) err = err5;
-
-                                // retry per section 6.5.3.4
-                                if (IsDoubleResultSubnormal(correct2, ulps)
-                                    || IsDoubleResultSubnormal(correct3, ulps)
-                                    || IsDoubleResultSubnormal(correct4, ulps)
-                                    || IsDoubleResultSubnormal(correct5, ulps))
-                                {
-                                    fail = fail && (test != 0.0f);
-                                    if (!fail) err = 0.0f;
-                                }
-                            }
-                        }
-                        else if (IsDoubleSubnormal(s2[j]))
-                        {
-                            long double correct2 = func.f_ff(s[j], 0.0);
-                            long double correct3 = func.f_ff(s[j], -0.0);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct2);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            fail = fail
-                                && ((!(fabsf(err2) <= ulps))
-                                    && (!(fabsf(err3) <= ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2, ulps)
-                                || IsDoubleResultSubnormal(correct3, ulps))
-                            {
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-                }
-
-                if (fabsf(err) > tinfo->maxError)
-                {
-                    tinfo->maxError = fabsf(err);
-                    tinfo->maxErrorValue = s[j];
-                    tinfo->maxErrorValue2 = s2[j];
-                }
-                if (fail)
-                {
-                    vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, "
-                               "%.13la}: *%.13la vs. %.13la\n",
-                               name, sizeNames[k], err, s[j], s2[j], r[j],
-                               test);
-                    error = -1;
-                    goto exit;
-                }
-            }
-        }
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
-                       j, error);
-            return error;
-        }
-    }
-
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
-
-
-    if (0 == (base & 0x0fffffff))
-    {
-        if (gVerboseBruteForce)
-        {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
-                 "ThreadCount:%2u\n",
-                 base, job->step, job->scale, buffer_elements, job->ulps,
-                 job->threadCount);
-        }
-        else
-        {
-            vlog(".");
-        }
-        fflush(stdout);
-    }
-
-exit:
-    return error;
-}
-
-int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    return TestFunc_Float_Float_Float_common(f, d, 0, relaxedMode);
-}
-
-int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    return TestFunc_Double_Double_Double_common(f, d, 0, relaxedMode);
-}
+    return TestFunc_Float_Float_Float_common(f, d, 0, relaxedMode);
+}
 
 int TestFunc_Float_Float_Float_nextafter(const Func *f, MTdata d,
                                          bool relaxedMode)
 {
     return TestFunc_Float_Float_Float_common(f, d, 1, relaxedMode);
 }
-
-int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata d,
-                                            bool relaxedMode)
-{
-    return TestFunc_Double_Double_Double_common(f, d, 1, relaxedMode);
-}
diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
similarity index 54%
rename from test_conformance/math_brute_force/binary_i.cpp
rename to test_conformance/math_brute_force/binary_i_double.cpp
index 50d14f33ce..4d6cb86097 100644
--- a/test_conformance/math_brute_force/binary_i.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -21,91 +21,6 @@
 #include <limits.h>
 #include <string.h>
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global float",
-                        sizeNames[vectorSize],
-                        "* out, __global float",
-                        sizeNames[vectorSize],
-                        "* in1, __global int",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global float* out, __global float* in, __global int* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       float3 f0 = vload3( 0, in + 3 * i );\n"
-        "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
-        "       f0 = ",
-        name,
-        "( f0, i0 );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       float3 f0;\n"
-        "       int3 i0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-        "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0, i0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
 static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
                              bool relaxedMode)
@@ -204,15 +119,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
-}
-
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
@@ -223,112 +129,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->relaxedMode);
 }
 
-// A table of more difficult cases to get right
-static const float specialValuesFloat[] = {
-    -NAN,
-    -INFINITY,
-    -FLT_MAX,
-    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
-    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
-    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
-    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
-    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
-    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
-    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
-    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
-    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
-    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
-    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
-    -1000.f,
-    -100.f,
-    -4.0f,
-    -3.5f,
-    -3.0f,
-    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
-    -2.5f,
-    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
-    -2.0f,
-    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
-    -1.5f,
-    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
-    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
-    -1.0f,
-    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
-    -0.5f,
-    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
-    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
-    -0.25f,
-    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
-    -FLT_MIN,
-    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
-    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
-    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
-    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
-    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
-    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
-    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
-    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
-    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
-    -0.0f,
-
-    +NAN,
-    +INFINITY,
-    +FLT_MAX,
-    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
-    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
-    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
-    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
-    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
-    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
-    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
-    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
-    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
-    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
-    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
-    +1000.f,
-    +100.f,
-    +4.0f,
-    +3.5f,
-    +3.0f,
-    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
-    2.5f,
-    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
-    +2.0f,
-    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
-    1.5f,
-    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
-    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
-    +1.0f,
-    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
-    +0.5f,
-    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
-    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
-    +0.25f,
-    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
-    +FLT_MIN,
-    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
-    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
-    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
-    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
-    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
-    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
-    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
-    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
-    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
-    +0.0f
-};
-
-static const size_t specialValuesFloatCount =
-    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
-
 static const int specialValuesInt[] = {
     0,           1,           2,          3,          126,        127,
     128,         0x02000001,  0x04000001, 1465264071, 1488522147, -1,
@@ -373,576 +173,6 @@ typedef struct TestInfo
     // no special values
 } TestInfo;
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
-
-int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    size_t i, j;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    cl_int maxErrorVal2 = 0;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_float)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        cl_buffer_region region2 = { i * test_info.subBufferSize
-                                         * sizeof(cl_int),
-                                     test_info.subBufferSize * sizeof(cl_int) };
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gInBuffer for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
-        {
-            p[j] = (genrand_int32(d) & ~0x40000000) | 0x38000000;
-            p2[j] = 3;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
-    vlog("\n");
-
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
-{
-    const TestInfo *job = (const TestInfo *)data;
-    size_t buffer_elements = job->subBufferSize;
-    size_t buffer_size = buffer_elements * sizeof(cl_float);
-    cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
-    fptr func = job->f->func;
-    int ftz = job->ftz;
-    float ulps = job->ulps;
-    MTdata d = tinfo->d;
-    cl_uint j, k;
-    cl_int error;
-    const char *name = job->f->name;
-    cl_uint *t = 0;
-    cl_float *r = 0;
-    cl_float *s = 0;
-    cl_int *s2 = 0;
-
-    // start the map of the output arrays
-    cl_event e[VECTOR_SIZE_COUNT];
-    cl_uint *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_uint *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
-
-    // Init input array
-    cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
-    cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
-    j = 0;
-
-    int totalSpecialValueCount =
-        specialValuesFloatCount * specialValuesIntCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
-
-    if (job_id <= (cl_uint)indx)
-    { // test edge cases
-        float *fp = (float *)p;
-        cl_int *ip2 = (cl_int *)p2;
-        uint32_t x, y;
-
-        x = (job_id * buffer_elements) % specialValuesFloatCount;
-        y = (job_id * buffer_elements) / specialValuesFloatCount;
-
-        for (; j < buffer_elements; j++)
-        {
-            fp[j] = specialValuesFloat[x];
-            ip2[j] = specialValuesInt[y];
-            ++x;
-            if (x >= specialValuesFloatCount)
-            {
-                x = 0;
-                y++;
-                if (y >= specialValuesIntCount) break;
-            }
-        }
-    }
-
-    // Init any remaining values.
-    for (; j < buffer_elements; j++)
-    {
-        p[j] = genrand_int32(d);
-        p2[j] = genrand_int32(d);
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
-                                      buffer_size, p, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
-                                      buffer_size, p2, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            goto exit;
-        }
-        if ((error = clReleaseEvent(e[j])))
-        {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // Fill the result buffer with garbage, so that old results don't carry
-        // over
-        uint32_t pattern = 0xffffdead;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // run the kernel
-        size_t vectorCount =
-            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
-                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];
-
-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-
-        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
-                                            &vectorCount, NULL, 0, NULL, NULL)))
-        {
-            vlog_error("FAILED -- could not execute kernel\n");
-            goto exit;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
-
-    if (gSkipCorrectnessTesting) return CL_SUCCESS;
-
-    // Calculate the correctly rounded reference result
-    r = (float *)gOut_Ref + thread_id * buffer_elements;
-    s = (float *)gIn + thread_id * buffer_elements;
-    s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_fi(s[j], s2[j]);
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_uint *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
-            buffer_size, 0, NULL, NULL, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            goto exit;
-        }
-    }
-
-    // Wait for the last buffer
-    out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
-                                           0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
-    // Verify data
-    t = (cl_uint *)r;
-    for (j = 0; j < buffer_elements; j++)
-    {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        {
-            cl_uint *q = out[k];
-
-            // If we aren't getting the correctly rounded result
-            if (t[j] != q[j])
-            {
-                float test = ((float *)q)[j];
-                double correct = func.f_fi(s[j], s2[j]);
-                float err = Ulp_Error(test, correct);
-                int fail = !(fabsf(err) <= ulps);
-
-                if (fail && ftz)
-                {
-                    // retry per section 6.5.3.2
-                    if (IsFloatResultSubnormal(correct, ulps))
-                    {
-                        fail = fail && (test != 0.0f);
-                        if (!fail) err = 0.0f;
-                    }
-
-                    // retry per section 6.5.3.3
-                    if (IsFloatSubnormal(s[j]))
-                    {
-                        double correct2, correct3;
-                        float err2, err3;
-                        correct2 = func.f_fi(0.0, s2[j]);
-                        correct3 = func.f_fi(-0.0, s2[j]);
-                        err2 = Ulp_Error(test, correct2);
-                        err3 = Ulp_Error(test, correct3);
-                        fail = fail
-                            && ((!(fabsf(err2) <= ulps))
-                                && (!(fabsf(err3) <= ulps)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
-
-                        // retry per section 6.5.3.4
-                        if (IsFloatResultSubnormal(correct2, ulps)
-                            || IsFloatResultSubnormal(correct3, ulps))
-                        {
-                            fail = fail && (test != 0.0f);
-                            if (!fail) err = 0.0f;
-                        }
-                    }
-                }
-
-                if (fabsf(err) > tinfo->maxError)
-                {
-                    tinfo->maxError = fabsf(err);
-                    tinfo->maxErrorValue = s[j];
-                    tinfo->maxErrorValue2 = s2[j];
-                }
-                if (fail)
-                {
-                    vlog_error(
-                        "\nERROR: %s%s: %f ulp error at {%a (0x%8.8x), %d}: "
-                        "*%a (0x%8.8x) vs. %a (0x%8.8x) at index: %d\n",
-                        name, sizeNames[k], err, s[j], ((uint32_t *)s)[j],
-                        s2[j], r[j], ((uint32_t *)r)[j], test,
-                        ((cl_uint *)&test)[0], j);
-                    error = -1;
-                    goto exit;
-                }
-            }
-        }
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
-                       j, error);
-            return error;
-        }
-    }
-
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
-
-
-    if (0 == (base & 0x0fffffff))
-    {
-        if (gVerboseBruteForce)
-        {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
-                 "ThreadCount:%2u\n",
-                 base, job->step, job->scale, buffer_elements, job->ulps,
-                 job->threadCount);
-        }
-        else
-        {
-            vlog(".");
-        }
-        fflush(stdout);
-    }
-
-exit:
-    return error;
-}
-
-
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
     -NAN,
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
new file mode 100644
index 0000000000..0ff9b57f7b
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -0,0 +1,845 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <limits.h>
+#include <string.h>
+// Build the float = name(float, int) test kernels for one vector size index.
+static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                       cl_kernel *k, cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global int",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
+    // Source for 3-element vectors: vload3/vstore3 plus a partial last item.
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in, __global int* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0, i0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       int3 i0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    // Use the generic source unless this index maps to 3-element vectors.
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+
+    char testName[32]; // same math_kernel<size> name as emitted in the source
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
+                       relaxedMode);
+}
+
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count; // count forwarded to BuildKernel for each size
+    cl_kernel **kernels; // kernels[i]: kernel array for vector size index i
+    cl_program *programs; // programs[i]: program for vector size index i
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
+{
+    BuildKernelInfo *info = (BuildKernelInfo *)p; // caller's build parameters
+    cl_uint i = info->offset + job_id; // vector size index built by this job
+    return BuildKernel(info->nameInCode, i, info->kernel_count,
+                       info->kernels[i], info->programs + i, info->relaxedMode);
+}
+
+// Float edge cases: NaNs, infinities, 2^n neighbours, denormals, signed zeros
+static const float specialValuesFloat[] = {
+    -NAN,
+    -INFINITY,
+    -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
+    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
+    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
+    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
+    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
+    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
+    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
+    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
+    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
+    -1000.f,
+    -100.f,
+    -4.0f,
+    -3.5f,
+    -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
+    -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
+    -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
+    -1.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
+    -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
+    -0.5f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
+    -0.25f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
+    -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
+    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
+    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
+    -0.0f,
+
+    +NAN,
+    +INFINITY,
+    +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
+    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
+    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
+    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
+    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
+    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
+    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
+    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
+    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
+    +1000.f,
+    +100.f,
+    +4.0f,
+    +3.5f,
+    +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
+    2.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
+    +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
+    1.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
+    +1.0f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
+    +0.5f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
+    +0.25f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
+    +FLT_MIN,
+    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
+    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
+    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
+    +0.0f
+};
+
+static const size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
+
+static const int specialValuesInt[] = {
+    0,           1,           2,          3,          126,        127,
+    128,         0x02000001,  0x04000001, 1465264071, 1488522147, -1,
+    -2,          -3,          -126,       -127,       -128,       -0x02000001,
+    -0x04000001, -1465264071, -1488522147
+};
+static const size_t specialValuesIntCount =
+    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]); // element count
+
+// Thread specific data for a worker thread
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // first (float) input sub-buffer for the thread
+    cl_mem inBuf2; // second (int) input sub-buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    float maxError; // largest |ulp error| seen so far. Init to 0.
+    double
+        maxErrorValue; // param 1 input that gave the max error.  Init to 0.
+    cl_int maxErrorValue2; // param 2 input that gave the max error.  Init
+                           // to 0.
+    MTdata d; // per-thread random number generator state
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs handed to the thread pool
+    cl_uint step; // subBufferSize * scale: step between one chunk and the next
+    cl_uint scale; // stride between individual test values
+    float ulps; // max allowed error, in ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    // no special values
+} TestInfo;
+
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
+
+int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error = CL_SUCCESS;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    cl_int maxErrorVal2 = 0;
+    // Entry point: tests f(float, int) on every enabled vector size.
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    }
+    // One job advances base by step; jobCount jobs cover the 32-bit range.
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        cl_buffer_region region2 = { i * test_info.subBufferSize
+                                         * sizeof(cl_int),
+                                     test_info.subBufferSize * sizeof(cl_int) };
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n",
+                       region2.origin, region2.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer for region {%zu, %zu}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+    // Optional timing pass: thread 0's kernels run over the whole buffers.
+    if (gMeasureTimes)
+    {
+        // Init input arrays
+        uint32_t *p = (uint32_t *)gIn;
+        uint32_t *p2 = (uint32_t *)gIn2;
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        {
+            p[j] = (genrand_int32(d) & ~0x40000000) | 0x38000000;
+            p2[j] = 3;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release kernels, sub-buffers and queues
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit; // release kernels, sub-buffers and queues
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
+
+exit:
+    // Release (also reached on errors; test_info was zeroed by memset above)
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error; // CL_SUCCESS, or the first failure seen above
+}
+
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
+    cl_uint base = job_id * (cl_uint)job->step; // used only for progress output
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    float ulps = job->ulps;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_uint *t = 0;
+    cl_float *r = 0;
+    cl_float *s = 0;
+    cl_int *s2 = 0;
+    // ThreadPool job: data is the shared TestInfo; returns 0, CL error or -1.
+    // Start by mapping the output arrays (non-blocking).
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_uint *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Init input array
+    cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
+    cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
+    j = 0;
+
+    int totalSpecialValueCount =
+        specialValuesFloatCount * specialValuesIntCount;
+    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)indx)
+    { // test edge cases
+        float *fp = (float *)p;
+        cl_int *ip2 = (cl_int *)p2;
+        uint32_t x, y;
+
+        x = (job_id * buffer_elements) % specialValuesFloatCount;
+        y = (job_id * buffer_elements) / specialValuesFloatCount;
+
+        for (; j < buffer_elements; j++)
+        {
+            fp[j] = specialValuesFloat[x];
+            ip2[j] = specialValuesInt[y];
+            ++x;
+            if (x >= specialValuesFloatCount)
+            {
+                x = 0;
+                y++;
+                if (y >= specialValuesIntCount) break;
+            }
+        }
+    }
+    // Init any remaining values. NOTE(review): when the loop above exits via
+    // break, slot j (the last special pair) is overwritten here -- intended?
+    for (; j < buffer_elements; j++)
+    {
+        p[j] = genrand_int32(d);
+        p2[j] = genrand_int32(d);
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            goto exit;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // Fill the result buffer with garbage so old results don't carry over
+        uint32_t pattern = 0xffffdead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
+            goto exit;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            goto exit;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    r = (float *)gOut_Ref + thread_id * buffer_elements;
+    s = (float *)gIn + thread_id * buffer_elements;
+    s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_fi(s[j], s2[j]);
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            goto exit;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                           0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        goto exit;
+    }
+
+    // Verify data
+    t = (cl_uint *)r;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_uint *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (t[j] != q[j])
+            {
+                float test = ((float *)q)[j];
+                double correct = func.f_fi(s[j], s2[j]);
+                float err = Ulp_Error(test, correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail && ftz)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsFloatResultSubnormal(correct, ulps))
+                    {
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsFloatSubnormal(s[j]))
+                    {
+                        double correct2, correct3;
+                        float err2, err3;
+                        correct2 = func.f_fi(0.0, s2[j]);
+                        correct3 = func.f_fi(-0.0, s2[j]);
+                        err2 = Ulp_Error(test, correct2);
+                        err3 = Ulp_Error(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsFloatResultSubnormal(correct2, ulps)
+                            || IsFloatResultSubnormal(correct3, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                    tinfo->maxErrorValue2 = s2[j];
+                }
+                if (fail)
+                {
+                    vlog_error(
+                        "\nERROR: %s%s: %f ulp error at {%a (0x%8.8x), %d}: "
+                        "*%a (0x%8.8x) vs. %a (0x%8.8x) at index: %d\n",
+                        name, sizeNames[k], err, s[j], ((uint32_t *)s)[j],
+                        s2[j], r[j], ((uint32_t *)r)[j], test,
+                        ((cl_uint *)&test)[0], j);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    // Progress output, printed only for jobs whose base has 28 zero low bits.
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+exit:
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
new file mode 100644
index 0000000000..7f86afde24
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -0,0 +1,911 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+static int BuildKernelDouble(const char *name, const char *operator_symbol,
+                             int vectorSize, cl_uint kernel_count, cl_kernel *k,
+                             cl_program *p, bool relaxedMode)
+{
+    // Assembles the source for out = in1 <operator_symbol> in2 and builds it.
+    const char *genericSource[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void ",
+        name,
+        "_kernel",
+        sizeNames[vectorSize],
+        "( __global double",
+        sizeNames[vectorSize],
+        "* out, __global double",
+        sizeNames[vectorSize],
+        "* in1, __global double",
+        sizeNames[vectorSize],
+        "* in2 )\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   out[i] =  in1[i] ",
+        operator_symbol,
+        " in2[i];\n"
+        "}\n"
+    };
+    // double3 variant: scalar pointers, vload3/vstore3 and a partial tail.
+    const char *vec3Source[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void ",
+        name,
+        "_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       d0 = d0 ",
+        operator_symbol,
+        " d1;\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = d0 ",
+        operator_symbol,
+        " d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    // Only a vector width of 3 uses the dedicated source; all others share one.
+    const bool isVec3 = (sizeValues[vectorSize] == 3);
+    const char **source = isVec3 ? vec3Source : genericSource;
+    const size_t sourceCount = isVec3
+        ? sizeof(vec3Source) / sizeof(vec3Source[0])
+        : sizeof(genericSource) / sizeof(genericSource[0]);
+
+    // Entry point is "<name>_kernel<size suffix>", matching the sources above.
+    char kernelName[32];
+    snprintf(kernelName, sizeof(kernelName) - 1, "%s_kernel%s", name,
+             sizeNames[vectorSize]);
+    return MakeKernels(source, (cl_uint)sourceCount, kernelName, kernel_count,
+                       k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo // argument block for BuildKernel_DoubleFn
+{
+    cl_uint offset; // the first vector size to build (job_id is added to it)
+    cl_uint kernel_count; // forwarded to MakeKernels as the kernel count
+    cl_kernel **kernels; // kernels[vector size index] is passed to MakeKernels
+    cl_program *programs; // &programs[vector size index] goes to MakeKernels
+    const char *name; // kernel name prefix: "<name>_kernel<size suffix>"
+    const char *operator_symbol; // operator text placed between the operands
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{
+    const BuildKernelInfo *b = (const BuildKernelInfo *)p;
+    const cl_uint idx = b->offset + job_id; // vector size index for this job
+    return BuildKernelDouble(b->name, b->operator_symbol, idx, b->kernel_count,
+                             b->kernels[idx], b->programs + idx,
+                             b->relaxedMode);
+}
+
+// Per-worker-thread state; TestInfo::tinfo points to an array of these.
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // first input buffer for the thread
+    cl_mem inBuf2; // second input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers, one per vector size
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // presumably the 1st operand at maxError.  Init to 0.
+    double maxErrorValue2; // presumably the 2nd operand at maxError.  Init
+                           // to 0.
+    MTdata d; // NOTE(review): presumably this thread's RNG state -- confirm
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo // state passed to ThreadPool_Do for the TestDouble jobs
+{
+    size_t subBufferSize; // Size of each thread's sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // one program per vector size
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs: 2^32 / step, or 1 if step overflowed
+    cl_uint step; // subBufferSize * scale: step between consecutive chunks
+    cl_uint scale; // stride between individual test values (getTestScale)
+    float ulps; // max allowed error in ulps (taken from f->double_ulps)
+    int ftz; // non-zero if running in flush to zero mode
+    bool relaxedMode; // True if the test is being run in relaxed mode, false
+                      // otherwise.
+
+    // This test variant needs no additional fields.
+} TestInfo;
+
+// A table of more difficult cases to get right: negative half, then positive
+static const double specialValuesDouble[] = {
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
+    -1000.,
+    -100.,
+    -4.0,
+    -3.5,
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
+    -0.5,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
+    -0.25,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
+    // Positive counterparts of the values above, in the same order.
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000.,
+    +100.,
+    +4.0,
+    +3.5,
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
+    +0.5,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
+    +0.25,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
+};
+
+static size_t specialValuesDoubleCount = // entries in specialValuesDouble
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
+
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
+// Brute-force tests operator f on double pairs; returns 0 or an error code.
+int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
+                                           bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    // Init test_info: sub-buffer size per thread, job step and job count
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    }
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow: fall back to a single job
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zu, %zu}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels: one thread pool job per enabled vector size
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex,
+                                       test_info.threadCount,
+                                       test_info.k,
+                                       test_info.programs,
+                                       f->name,
+                                       f->nameInCode,
+                                       relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the correctness jobs on the thread pool
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors (done even when a job failed)
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init both input arrays with random doubles for the timing run
+        double *p = (double *)gIn;
+        double *p2 = (double *)gIn2;
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+        {
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+            p2[j] = DoubleFromUInt32(genrand_int32(d));
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release kernels, sub-buffers and queues
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit; // release kernels, sub-buffers and queues
+        }
+
+        // Time each vector size on gQueue using thread 0's kernel copy
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
+
+exit:
+    // Release everything acquired above; error paths also land here
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
+// Thread pool job: tests one sub-buffer of input pairs; returns a cl_int.
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
+    int ftz = job->ftz;
+    bool relaxedMode = job->relaxedMode;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_ulong *t;
+    cl_double *r;
+    cl_double *s;
+    cl_double *s2;
+
+    Force64BitFPUPrecision();
+
+    // start the non-blocking map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error; // goto exit would bypass the initializers below
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Init this thread's slices of the shared input arrays
+    cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
+    cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
+    j = 0;
+    int totalSpecialValueCount =
+        specialValuesDoubleCount * specialValuesDoubleCount;
+    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)indx)
+    { // test edge cases
+        cl_double *fp = (cl_double *)p;
+        cl_double *fp2 = (cl_double *)p2;
+        uint32_t x, y;
+
+        x = (job_id * buffer_elements) % specialValuesDoubleCount;
+        y = (job_id * buffer_elements) / specialValuesDoubleCount;
+
+        for (; j < buffer_elements; j++)
+        {
+            fp[j] = specialValuesDouble[x];
+            fp2[j] = specialValuesDouble[y];
+            if (++x >= specialValuesDoubleCount)
+            {
+                x = 0;
+                y++;
+                if (y >= specialValuesDoubleCount) break;
+            }
+        }
+    }
+
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
+    {
+        p[j] = genrand_int64(d);
+        p2[j] = genrand_int64(d);
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            goto exit;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint32_t pattern = 0xffffdead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject failed! (%d)\n", error);
+            goto exit;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            goto exit;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
+    s = (cl_double *)gIn + thread_id * buffer_elements;
+    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        r[j] = (cl_double)func.f_ff(s[j], s2[j]);
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            goto exit;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        goto exit;
+    }
+
+    // Verify data: compare bit patterns first, fall back to an ulp check
+    t = (cl_ulong *)r;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_ulong *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (t[j] != q[j])
+            {
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_ff(s[j], s2[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail && ftz)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsDoubleResultSubnormal(correct, ulps))
+                    {
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
+                    }
+
+
+                    // retry per section 6.5.3.3
+                    if (IsDoubleSubnormal(s[j]))
+                    {
+                        long double correct2 = func.f_ff(0.0, s2[j]);
+                        long double correct3 = func.f_ff(-0.0, s2[j]);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct2);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsDoubleResultSubnormal(correct2, ulps)
+                            || IsDoubleResultSubnormal(correct3, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // try with both args as zero
+                        if (IsDoubleSubnormal(s2[j]))
+                        {
+                            correct2 = func.f_ff(0.0, 0.0);
+                            correct3 = func.f_ff(-0.0, 0.0);
+                            long double correct4 = func.f_ff(0.0, -0.0);
+                            long double correct5 = func.f_ff(-0.0, -0.0);
+                            err2 = Bruteforce_Ulp_Error_Double(test, correct2);
+                            err3 = Bruteforce_Ulp_Error_Double(test, correct3);
+                            float err4 =
+                                Bruteforce_Ulp_Error_Double(test, correct4);
+                            float err5 =
+                                Bruteforce_Ulp_Error_Double(test, correct5);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps))
+                                    && (!(fabsf(err4) <= ulps))
+                                    && (!(fabsf(err5) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps)
+                                || IsDoubleResultSubnormal(correct4, ulps)
+                                || IsDoubleResultSubnormal(correct5, ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    else if (IsDoubleSubnormal(s2[j]))
+                    {
+                        long double correct2 = func.f_ff(s[j], 0.0);
+                        long double correct3 = func.f_ff(s[j], -0.0);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct2);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsDoubleResultSubnormal(correct2, ulps)
+                            || IsDoubleResultSubnormal(correct3, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                    tinfo->maxErrorValue2 = s2[j];
+                }
+                if (fail)
+                {
+                    vlog_error(
+                        "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a\n",
+                        name, sizeNames[k], err, s[j], s2[j], r[j], test);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10zu "
+                 "ulps:%5.3f ThreadCount:%2u\n",
+                 base, job->step, (size_t)job->scale, buffer_elements,
+                 job->ulps, job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+exit:
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_operator.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
similarity index 54%
rename from test_conformance/math_brute_force/binary_operator.cpp
rename to test_conformance/math_brute_force/binary_operator_float.cpp
index 65756901f6..56b0280c97 100644
--- a/test_conformance/math_brute_force/binary_operator.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -110,98 +110,6 @@ static int BuildKernel(const char *name, const char *operator_symbol,
                        relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, const char *operator_symbol,
-                             int vectorSize, cl_uint kernel_count, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void ",
-                        name,
-                        "_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* in1, __global double",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] =  in1[i] ",
-                        operator_symbol,
-                        " in2[i];\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void ",
-        name,
-        "_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global double* in, __global double* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 d0 = vload3( 0, in + 3 * i );\n"
-        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-        "       d0 = d0 ",
-        operator_symbol,
-        " d1;\n"
-        "       vstore3( d0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 d0;\n"
-        "       double3 d1;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       d0 = d0 ",
-        operator_symbol,
-        " d1;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = d0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = d0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "%s_kernel%s", name,
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
 typedef struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
@@ -222,16 +130,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->name, info->operator_symbol, i,
-                             info->kernel_count, info->kernels[i],
-                             info->programs + i, info->relaxedMode);
-}
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -1139,743 +1037,3 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     if (overflow) free(overflow);
     return error;
 }
-
-// A table of more difficult cases to get right
-static const double specialValuesDouble[] = {
-    -NAN,
-    -INFINITY,
-    -DBL_MAX,
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
-    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
-    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
-    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
-    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
-    -1000.,
-    -100.,
-    -4.0,
-    -3.5,
-    -3.0,
-    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
-    -2.5,
-    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
-    -2.0,
-    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
-    -1.5,
-    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
-    -1.0,
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
-    -0.5,
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
-    -0.25,
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
-    -DBL_MIN,
-    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
-    -0.0,
-
-    +NAN,
-    +INFINITY,
-    +DBL_MAX,
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
-    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
-    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
-    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
-    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
-    +1000.,
-    +100.,
-    +4.0,
-    +3.5,
-    +3.0,
-    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
-    +2.5,
-    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
-    +2.0,
-    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
-    +1.5,
-    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
-    +1.0,
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
-    +0.5,
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
-    +0.25,
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
-    +DBL_MIN,
-    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
-    +0.0,
-};
-
-static size_t specialValuesDoubleCount =
-    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
-
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
-
-int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
-                                           bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    size_t i, j;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    double maxErrorVal2 = 0.0;
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_double)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->double_ulps;
-    test_info.ftz = f->ftz || gForceFTZ;
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex,
-                                       test_info.threadCount,
-                                       test_info.k,
-                                       test_info.programs,
-                                       f->name,
-                                       f->nameInCode,
-                                       relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        double *p = (double *)gIn;
-        double *p2 = (double *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
-    vlog("\n");
-
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
-{
-    const TestInfo *job = (const TestInfo *)data;
-    size_t buffer_elements = job->subBufferSize;
-    size_t buffer_size = buffer_elements * sizeof(cl_double);
-    cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
-    float ulps = job->ulps;
-    dptr func = job->f->dfunc;
-    int ftz = job->ftz;
-    bool relaxedMode = job->relaxedMode;
-    MTdata d = tinfo->d;
-    cl_uint j, k;
-    cl_int error;
-    const char *name = job->f->name;
-    cl_ulong *t;
-    cl_double *r;
-    cl_double *s;
-    cl_double *s2;
-
-    Force64BitFPUPrecision();
-
-    // start the map of the output arrays
-    cl_event e[VECTOR_SIZE_COUNT];
-    cl_ulong *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
-
-    // Init input array
-    cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
-    cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
-    j = 0;
-    int totalSpecialValueCount =
-        specialValuesDoubleCount * specialValuesDoubleCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
-
-    if (job_id <= (cl_uint)indx)
-    { // test edge cases
-        cl_double *fp = (cl_double *)p;
-        cl_double *fp2 = (cl_double *)p2;
-        uint32_t x, y;
-
-        x = (job_id * buffer_elements) % specialValuesDoubleCount;
-        y = (job_id * buffer_elements) / specialValuesDoubleCount;
-
-        for (; j < buffer_elements; j++)
-        {
-            fp[j] = specialValuesDouble[x];
-            fp2[j] = specialValuesDouble[y];
-            if (++x >= specialValuesDoubleCount)
-            {
-                x = 0;
-                y++;
-                if (y >= specialValuesDoubleCount) break;
-            }
-        }
-    }
-
-    // Init any remaining values.
-    for (; j < buffer_elements; j++)
-    {
-        p[j] = genrand_int64(d);
-        p2[j] = genrand_int64(d);
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
-                                      buffer_size, p, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
-                                      buffer_size, p2, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            goto exit;
-        }
-        if ((error = clReleaseEvent(e[j])))
-        {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // Fill the result buffer with garbage, so that old results don't carry
-        // over
-        uint32_t pattern = 0xffffdead;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // run the kernel
-        size_t vectorCount =
-            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
-                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];
-
-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-
-        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
-                                            &vectorCount, NULL, 0, NULL, NULL)))
-        {
-            vlog_error("FAILED -- could not execute kernel\n");
-            goto exit;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
-
-    if (gSkipCorrectnessTesting) return CL_SUCCESS;
-
-    // Calculate the correctly rounded reference result
-    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
-    s = (cl_double *)gIn + thread_id * buffer_elements;
-    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
-        r[j] = (cl_double)func.f_ff(s[j], s2[j]);
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
-            buffer_size, 0, NULL, NULL, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            goto exit;
-        }
-    }
-
-    // Wait for the last buffer
-    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
-    // Verify data
-    t = (cl_ulong *)r;
-    for (j = 0; j < buffer_elements; j++)
-    {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        {
-            cl_ulong *q = out[k];
-
-            // If we aren't getting the correctly rounded result
-            if (t[j] != q[j])
-            {
-                cl_double test = ((cl_double *)q)[j];
-                long double correct = func.f_ff(s[j], s2[j]);
-                float err = Bruteforce_Ulp_Error_Double(test, correct);
-                int fail = !(fabsf(err) <= ulps);
-
-                if (fail && ftz)
-                {
-                    // retry per section 6.5.3.2
-                    if (IsDoubleResultSubnormal(correct, ulps))
-                    {
-                        fail = fail && (test != 0.0f);
-                        if (!fail) err = 0.0f;
-                    }
-
-
-                    // retry per section 6.5.3.3
-                    if (IsDoubleSubnormal(s[j]))
-                    {
-                        long double correct2 = func.f_ff(0.0, s2[j]);
-                        long double correct3 = func.f_ff(-0.0, s2[j]);
-                        float err2 =
-                            Bruteforce_Ulp_Error_Double(test, correct2);
-                        float err3 =
-                            Bruteforce_Ulp_Error_Double(test, correct3);
-                        fail = fail
-                            && ((!(fabsf(err2) <= ulps))
-                                && (!(fabsf(err3) <= ulps)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
-
-                        // retry per section 6.5.3.4
-                        if (IsDoubleResultSubnormal(correct2, ulps)
-                            || IsDoubleResultSubnormal(correct3, ulps))
-                        {
-                            fail = fail && (test != 0.0f);
-                            if (!fail) err = 0.0f;
-                        }
-
-                        // try with both args as zero
-                        if (IsDoubleSubnormal(s2[j]))
-                        {
-                            correct2 = func.f_ff(0.0, 0.0);
-                            correct3 = func.f_ff(-0.0, 0.0);
-                            long double correct4 = func.f_ff(0.0, -0.0);
-                            long double correct5 = func.f_ff(-0.0, -0.0);
-                            err2 = Bruteforce_Ulp_Error_Double(test, correct2);
-                            err3 = Bruteforce_Ulp_Error_Double(test, correct3);
-                            float err4 =
-                                Bruteforce_Ulp_Error_Double(test, correct4);
-                            float err5 =
-                                Bruteforce_Ulp_Error_Double(test, correct5);
-                            fail = fail
-                                && ((!(fabsf(err2) <= ulps))
-                                    && (!(fabsf(err3) <= ulps))
-                                    && (!(fabsf(err4) <= ulps))
-                                    && (!(fabsf(err5) <= ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-                            if (fabsf(err4) < fabsf(err)) err = err4;
-                            if (fabsf(err5) < fabsf(err)) err = err5;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2, ulps)
-                                || IsDoubleResultSubnormal(correct3, ulps)
-                                || IsDoubleResultSubnormal(correct4, ulps)
-                                || IsDoubleResultSubnormal(correct5, ulps))
-                            {
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-                    else if (IsDoubleSubnormal(s2[j]))
-                    {
-                        long double correct2 = func.f_ff(s[j], 0.0);
-                        long double correct3 = func.f_ff(s[j], -0.0);
-                        float err2 =
-                            Bruteforce_Ulp_Error_Double(test, correct2);
-                        float err3 =
-                            Bruteforce_Ulp_Error_Double(test, correct3);
-                        fail = fail
-                            && ((!(fabsf(err2) <= ulps))
-                                && (!(fabsf(err3) <= ulps)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
-
-                        // retry per section 6.5.3.4
-                        if (IsDoubleResultSubnormal(correct2, ulps)
-                            || IsDoubleResultSubnormal(correct3, ulps))
-                        {
-                            fail = fail && (test != 0.0f);
-                            if (!fail) err = 0.0f;
-                        }
-                    }
-                }
-
-                if (fabsf(err) > tinfo->maxError)
-                {
-                    tinfo->maxError = fabsf(err);
-                    tinfo->maxErrorValue = s[j];
-                    tinfo->maxErrorValue2 = s2[j];
-                }
-                if (fail)
-                {
-                    vlog_error(
-                        "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a\n",
-                        name, sizeNames[k], err, s[j], s2[j], r[j], test);
-                    error = -1;
-                    goto exit;
-                }
-            }
-        }
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
-                       j, error);
-            return error;
-        }
-    }
-
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
-
-
-    if (0 == (base & 0x0fffffff))
-    {
-        if (gVerboseBruteForce)
-        {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
-                 "ThreadCount:%2u\n",
-                 base, job->step, job->scale, buffer_elements, job->ulps,
-                 job->threadCount);
-        }
-        else
-        {
-            vlog(".");
-        }
-        fflush(stdout);
-    }
-
-exit:
-    return error;
-}
diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
deleted file mode 100644
index a20c0571b1..0000000000
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ /dev/null
@@ -1,1298 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-#include "function_list.h"
-#include "test_functions.h"
-#include "utility.h"
-
-#include <limits.h>
-#include <string.h>
-
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global float",
-                        sizeNames[vectorSize],
-                        "* out, __global int",
-                        sizeNames[vectorSize],
-                        "* out2, __global float",
-                        sizeNames[vectorSize],
-                        "* in1, __global float",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i], out2 + i );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global float* out, __global int* out2, __global float* in, "
-        "__global float* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       float3 f0 = vload3( 0, in + 3 * i );\n"
-        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-        "       int3 i0 = 0xdeaddead;\n"
-        "       f0 = ",
-        name,
-        "( f0, f1, &i0 );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "       vstore3( i0, 0, out2 + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       float3 f0;\n"
-        "       float3 f1;\n"
-        "       int3 i0 = 0xdeaddead;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0, f1, &i0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               out2[3*i+1] = i0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               out2[3*i] = i0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global int",
-                        sizeNames[vectorSize],
-                        "* out2, __global double",
-                        sizeNames[vectorSize],
-                        "* in1, __global double",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i], out2 + i );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global int* out2, __global double* in, "
-        "__global double* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 d0 = vload3( 0, in + 3 * i );\n"
-        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-        "       int3 i0 = 0xdeaddead;\n"
-        "       d0 = ",
-        name,
-        "( d0, d1, &i0 );\n"
-        "       vstore3( d0, 0, out + 3*i );\n"
-        "       vstore3( i0, 0, out2 + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 d0;\n"
-        "       double3 d1;\n"
-        "       int3 i0 = 0xdeaddead;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       d0 = ",
-        name,
-        "( d0, d1, &i0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = d0.y; \n"
-        "               out2[3*i+1] = i0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = d0.x; \n"
-        "               out2[3*i] = i0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
-typedef struct BuildKernelInfo
-{
-    cl_uint offset; // the first vector size to build
-    cl_kernel *kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
-}
-
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
-}
-
-typedef struct ComputeReferenceInfoF_
-{
-    const float *x;
-    const float *y;
-    float *r;
-    int *i;
-    double (*f_ffpI)(double, double, int *);
-    cl_uint lim;
-    cl_uint count;
-} ComputeReferenceInfoF;
-
-typedef struct ComputeReferenceInfoD_
-{
-    const double *x;
-    const double *y;
-    double *r;
-    int *i;
-    long double (*f_ffpI)(long double, long double, int *);
-    cl_uint lim;
-    cl_uint count;
-} ComputeReferenceInfoD;
-
-static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
-{
-    ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
-    cl_uint lim = cri->lim;
-    cl_uint count = cri->count;
-    cl_uint off = jid * count;
-    const float *x = cri->x + off;
-    const float *y = cri->y + off;
-    float *r = cri->r + off;
-    int *i = cri->i + off;
-    double (*f)(double, double, int *) = cri->f_ffpI;
-    cl_uint j;
-
-    if (off + count > lim) count = lim - off;
-
-    for (j = 0; j < count; ++j)
-        r[j] = (float)f((double)x[j], (double)y[j], i + j);
-
-    return CL_SUCCESS;
-}
-
-static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
-{
-    ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
-    cl_uint lim = cri->lim;
-    cl_uint count = cri->count;
-    cl_uint off = jid * count;
-    const double *x = cri->x + off;
-    const double *y = cri->y + off;
-    double *r = cri->r + off;
-    int *i = cri->i + off;
-    long double (*f)(long double, long double, int *) = cri->f_ffpI;
-    cl_uint j;
-
-    if (off + count > lim) count = lim - off;
-
-    Force64BitFPUPrecision();
-
-    for (j = 0; j < count; ++j)
-        r[j] = (double)f((long double)x[j], (long double)y[j], i + j);
-
-    return CL_SUCCESS;
-}
-
-int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    float maxError = 0.0f;
-    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    int64_t maxError2 = 0;
-    float maxErrorVal = 0.0f;
-    float maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
-
-    cl_uint threadCount = GetThreadCount();
-
-    float float_ulps;
-    if (gIsEmbedded)
-        float_ulps = f->float_embedded_ulps;
-    else
-        float_ulps = f->float_ulps;
-
-    int testingRemquo = !strcmp(f->name, "remquo");
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        cl_uint *p = (cl_uint *)gIn;
-        cl_uint *p2 = (cl_uint *)gIn2;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            p[j] = genrand_int32(d);
-            p2[j] = genrand_int32(d);
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-
-            memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
-                                          bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        float *s = (float *)gIn;
-        float *s2 = (float *)gIn2;
-
-        if (threadCount > 1)
-        {
-            ComputeReferenceInfoF cri;
-            cri.x = s;
-            cri.y = s2;
-            cri.r = (float *)gOut_Ref;
-            cri.i = (int *)gOut_Ref2;
-            cri.f_ffpI = f->func.f_ffpI;
-            cri.lim = bufferSize / sizeof(float);
-            cri.count = (cri.lim + threadCount - 1) / threadCount;
-            ThreadPool_Do(ReferenceF, threadCount, &cri);
-        }
-        else
-        {
-            float *r = (float *)gOut_Ref;
-            int *r2 = (int *)gOut_Ref2;
-            for (j = 0; j < bufferSize / sizeof(float); j++)
-                r[j] = (float)f->func.f_ffpI(s[j], s2[j], r2 + j);
-        }
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray2 failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data
-        uint32_t *t = (uint32_t *)gOut_Ref;
-        int32_t *t2 = (int32_t *)gOut_Ref2;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-            {
-                uint32_t *q = (uint32_t *)(gOut[k]);
-                int32_t *q2 = (int32_t *)gOut2[k];
-
-                // Check for exact match to correctly rounded result
-                if (t[j] == q[j] && t2[j] == q2[j]) continue;
-
-                // Check for paired NaNs
-                if ((t[j] & 0x7fffffff) > 0x7f800000
-                    && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j])
-                    continue;
-
-                float test = ((float *)q)[j];
-                int correct2 = INT_MIN;
-                double correct = f->func.f_ffpI(s[j], s2[j], &correct2);
-                float err = Ulp_Error(test, correct);
-                int64_t iErr;
-
-                // in case of remquo, we only care about the sign and last
-                // seven bits of integer as per the spec.
-                if (testingRemquo)
-                    iErr = (long long)(q2[j] & 0x0000007f)
-                        - (long long)(correct2 & 0x0000007f);
-                else
-                    iErr = (long long)q2[j] - (long long)correct2;
-
-                // For remquo, if y = 0, x is infinite, or either is NaN
-                // then the standard either neglects to say what is returned
-                // in iptr or leaves it undefined or implementation defined.
-                int iptrUndefined = fabs(((float *)gIn)[j]) == INFINITY
-                    || ((float *)gIn2)[j] == 0.0f || isnan(((float *)gIn2)[j])
-                    || isnan(((float *)gIn)[j]);
-                if (iptrUndefined) iErr = 0;
-
-                int fail = !(fabsf(err) <= float_ulps && iErr == 0);
-                if (ftz && fail)
-                {
-                    // retry per section 6.5.3.2
-                    if (IsFloatResultSubnormal(correct, float_ulps))
-                    {
-                        fail = fail && !(test == 0.0f && iErr == 0);
-                        if (!fail) err = 0.0f;
-                    }
-
-                    // retry per section 6.5.3.3
-                    if (IsFloatSubnormal(s[j]))
-                    {
-                        int correct3i, correct4i;
-                        double correct3 =
-                            f->func.f_ffpI(0.0, s2[j], &correct3i);
-                        double correct4 =
-                            f->func.f_ffpI(-0.0, s2[j], &correct4i);
-                        float err2 = Ulp_Error(test, correct3);
-                        float err3 = Ulp_Error(test, correct4);
-                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
-                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
-                        fail = fail
-                            && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
-                                && (!(fabsf(err3) <= float_ulps
-                                      && iErr4 == 0)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
-                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-
-                        // retry per section 6.5.3.4
-                        if (IsFloatResultSubnormal(correct2, float_ulps)
-                            || IsFloatResultSubnormal(correct3, float_ulps))
-                        {
-                            fail = fail
-                                && !(test == 0.0f
-                                     && (iErr3 == 0 || iErr4 == 0));
-                            if (!fail) err = 0.0f;
-                        }
-
-                        // try with both args as zero
-                        if (IsFloatSubnormal(s2[j]))
-                        {
-                            int correct7i, correct8i;
-                            correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i);
-                            correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i);
-                            double correct7 =
-                                f->func.f_ffpI(0.0, -0.0, &correct7i);
-                            double correct8 =
-                                f->func.f_ffpI(-0.0, -0.0, &correct8i);
-                            err2 = Ulp_Error(test, correct3);
-                            err3 = Ulp_Error(test, correct4);
-                            float err4 = Ulp_Error(test, correct7);
-                            float err5 = Ulp_Error(test, correct8);
-                            iErr3 = (long long)q2[j] - (long long)correct3i;
-                            iErr4 = (long long)q2[j] - (long long)correct4i;
-                            int64_t iErr7 =
-                                (long long)q2[j] - (long long)correct7i;
-                            int64_t iErr8 =
-                                (long long)q2[j] - (long long)correct8i;
-                            fail = fail
-                                && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
-                                    && (!(fabsf(err3) <= float_ulps
-                                          && iErr4 == 0))
-                                    && (!(fabsf(err4) <= float_ulps
-                                          && iErr7 == 0))
-                                    && (!(fabsf(err5) <= float_ulps
-                                          && iErr8 == 0)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-                            if (fabsf(err4) < fabsf(err)) err = err4;
-                            if (fabsf(err5) < fabsf(err)) err = err5;
-                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-                            if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
-                            if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
-
-                            // retry per section 6.5.3.4
-                            if (IsFloatResultSubnormal(correct3, float_ulps)
-                                || IsFloatResultSubnormal(correct4, float_ulps)
-                                || IsFloatResultSubnormal(correct7, float_ulps)
-                                || IsFloatResultSubnormal(correct8, float_ulps))
-                            {
-                                fail = fail
-                                    && !(test == 0.0f
-                                         && (iErr3 == 0 || iErr4 == 0
-                                             || iErr7 == 0 || iErr8 == 0));
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-                    else if (IsFloatSubnormal(s2[j]))
-                    {
-                        int correct3i, correct4i;
-                        double correct3 = f->func.f_ffpI(s[j], 0.0, &correct3i);
-                        double correct4 =
-                            f->func.f_ffpI(s[j], -0.0, &correct4i);
-                        float err2 = Ulp_Error(test, correct3);
-                        float err3 = Ulp_Error(test, correct4);
-                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
-                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
-                        fail = fail
-                            && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
-                                && (!(fabsf(err3) <= float_ulps
-                                      && iErr4 == 0)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
-                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-
-                        // retry per section 6.5.3.4
-                        if (IsFloatResultSubnormal(correct2, float_ulps)
-                            || IsFloatResultSubnormal(correct3, float_ulps))
-                        {
-                            fail = fail
-                                && !(test == 0.0f
-                                     && (iErr3 == 0 || iErr4 == 0));
-                            if (!fail) err = 0.0f;
-                        }
-                    }
-                }
-                    if (fabsf(err) > maxError)
-                    {
-                        maxError = fabsf(err);
-                        maxErrorVal = s[j];
-                    }
-                    if (llabs(iErr) > maxError2)
-                    {
-                        maxError2 = llabs(iErr);
-                        maxErrorVal2 = s[j];
-                    }
-
-                    if (fail)
-                    {
-                        vlog_error(
-                            "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} "
-                            "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, "
-                            "0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n",
-                            f->name, sizeNames[k], err, iErr, ((float *)gIn)[j],
-                            ((float *)gIn2)[j], ((cl_uint *)gIn)[j],
-                            ((cl_uint *)gIn2)[j], ((float *)gOut_Ref)[j],
-                            ((int *)gOut_Ref2)[j], ((cl_uint *)gOut_Ref)[j],
-                            ((cl_uint *)gOut_Ref2)[j], test, q2[j],
-                            ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]);
-                        error = -1;
-                        goto exit;
-                    }
-            }
-        }
-
-        if (0 == (i & 0x0fffffff))
-        {
-            if (gVerboseBruteForce)
-            {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
-            }
-            else
-            {
-                vlog(".");
-            }
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        cl_uint *p = (cl_uint *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            p[j] = genrand_int32(d);
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
-    vlog("\n");
-
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}
-
-int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    float maxError = 0.0f;
-    int64_t maxError2 = 0;
-    int ftz = f->ftz || gForceFTZ;
-    double maxErrorVal = 0.0f;
-    double maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(double), bufferSize);
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    cl_uint threadCount = GetThreadCount();
-
-    Force64BitFPUPrecision();
-
-    int testingRemquo = !strcmp(f->name, "remquo");
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        double *p2 = (double *)gIn2;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-
-            memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                          bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        double *s = (double *)gIn;
-        double *s2 = (double *)gIn2;
-
-        if (threadCount > 1)
-        {
-            ComputeReferenceInfoD cri;
-            cri.x = s;
-            cri.y = s2;
-            cri.r = (double *)gOut_Ref;
-            cri.i = (int *)gOut_Ref2;
-            cri.f_ffpI = f->dfunc.f_ffpI;
-            cri.lim = bufferSize / sizeof(double);
-            cri.count = (cri.lim + threadCount - 1) / threadCount;
-            ThreadPool_Do(ReferenceD, threadCount, &cri);
-        }
-        else
-        {
-            double *r = (double *)gOut_Ref;
-            int *r2 = (int *)gOut_Ref2;
-            for (j = 0; j < bufferSize / sizeof(double); j++)
-                r[j] = (double)f->dfunc.f_ffpI(s[j], s2[j], r2 + j);
-        }
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray2 failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data
-        uint64_t *t = (uint64_t *)gOut_Ref;
-        int32_t *t2 = (int32_t *)gOut_Ref2;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-        {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-            {
-                uint64_t *q = (uint64_t *)gOut[k];
-                int32_t *q2 = (int32_t *)gOut2[k];
-
-                // Check for exact match to correctly rounded result
-                if (t[j] == q[j] && t2[j] == q2[j]) continue;
-
-                // Check for paired NaNs
-                if ((t[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL
-                    && (q[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL
-                    && t2[j] == q2[j])
-                    continue;
-
-                double test = ((double *)q)[j];
-                int correct2 = INT_MIN;
-                long double correct = f->dfunc.f_ffpI(s[j], s2[j], &correct2);
-                float err = Bruteforce_Ulp_Error_Double(test, correct);
-                int64_t iErr;
-
-                // in case of remquo, we only care about the sign and last
-                // seven bits of integer as per the spec.
-                if (testingRemquo)
-                    iErr = (long long)(q2[j] & 0x0000007f)
-                        - (long long)(correct2 & 0x0000007f);
-                else
-                    iErr = (long long)q2[j] - (long long)correct2;
-
-                // For remquo, if y = 0, x is infinite, or either is NaN
-                // then the standard either neglects to say what is returned
-                // in iptr or leaves it undefined or implementation defined.
-                int iptrUndefined = fabs(((double *)gIn)[j]) == INFINITY
-                    || ((double *)gIn2)[j] == 0.0 || isnan(((double *)gIn2)[j])
-                    || isnan(((double *)gIn)[j]);
-                if (iptrUndefined) iErr = 0;
-
-                int fail = !(fabsf(err) <= f->double_ulps && iErr == 0);
-                if (ftz && fail)
-                {
-                    // retry per section 6.5.3.2
-                    if (IsDoubleResultSubnormal(correct, f->double_ulps))
-                    {
-                        fail = fail && !(test == 0.0f && iErr == 0);
-                        if (!fail) err = 0.0f;
-                    }
-
-                    // retry per section 6.5.3.3
-                    if (IsDoubleSubnormal(s[j]))
-                    {
-                        int correct3i, correct4i;
-                        long double correct3 =
-                            f->dfunc.f_ffpI(0.0, s2[j], &correct3i);
-                        long double correct4 =
-                            f->dfunc.f_ffpI(-0.0, s2[j], &correct4i);
-                        float err2 =
-                            Bruteforce_Ulp_Error_Double(test, correct3);
-                        float err3 =
-                            Bruteforce_Ulp_Error_Double(test, correct4);
-                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
-                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
-                        fail = fail
-                            && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0))
-                                && (!(fabsf(err3) <= f->double_ulps
-                                      && iErr4 == 0)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
-                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-
-                        // retry per section 6.5.3.4
-                        if (IsDoubleResultSubnormal(correct2, f->double_ulps)
-                            || IsDoubleResultSubnormal(correct3,
-                                                       f->double_ulps))
-                        {
-                            fail = fail
-                                && !(test == 0.0f
-                                     && (iErr3 == 0 || iErr4 == 0));
-                            if (!fail) err = 0.0f;
-                        }
-
-                        // try with both args as zero
-                        if (IsDoubleSubnormal(s2[j]))
-                        {
-                            int correct7i, correct8i;
-                            correct3 = f->dfunc.f_ffpI(0.0, 0.0, &correct3i);
-                            correct4 = f->dfunc.f_ffpI(-0.0, 0.0, &correct4i);
-                            long double correct7 =
-                                f->dfunc.f_ffpI(0.0, -0.0, &correct7i);
-                            long double correct8 =
-                                f->dfunc.f_ffpI(-0.0, -0.0, &correct8i);
-                            err2 = Bruteforce_Ulp_Error_Double(test, correct3);
-                            err3 = Bruteforce_Ulp_Error_Double(test, correct4);
-                            float err4 =
-                                Bruteforce_Ulp_Error_Double(test, correct7);
-                            float err5 =
-                                Bruteforce_Ulp_Error_Double(test, correct8);
-                            iErr3 = (long long)q2[j] - (long long)correct3i;
-                            iErr4 = (long long)q2[j] - (long long)correct4i;
-                            int64_t iErr7 =
-                                (long long)q2[j] - (long long)correct7i;
-                            int64_t iErr8 =
-                                (long long)q2[j] - (long long)correct8i;
-                            fail = fail
-                                && ((!(fabsf(err2) <= f->double_ulps
-                                       && iErr3 == 0))
-                                    && (!(fabsf(err3) <= f->double_ulps
-                                          && iErr4 == 0))
-                                    && (!(fabsf(err4) <= f->double_ulps
-                                          && iErr7 == 0))
-                                    && (!(fabsf(err5) <= f->double_ulps
-                                          && iErr8 == 0)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-                            if (fabsf(err4) < fabsf(err)) err = err4;
-                            if (fabsf(err5) < fabsf(err)) err = err5;
-                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-                            if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
-                            if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct3,
-                                                        f->double_ulps)
-                                || IsDoubleResultSubnormal(correct4,
-                                                           f->double_ulps)
-                                || IsDoubleResultSubnormal(correct7,
-                                                           f->double_ulps)
-                                || IsDoubleResultSubnormal(correct8,
-                                                           f->double_ulps))
-                            {
-                                fail = fail
-                                    && !(test == 0.0f
-                                         && (iErr3 == 0 || iErr4 == 0
-                                             || iErr7 == 0 || iErr8 == 0));
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-                    else if (IsDoubleSubnormal(s2[j]))
-                    {
-                        int correct3i, correct4i;
-                        long double correct3 =
-                            f->dfunc.f_ffpI(s[j], 0.0, &correct3i);
-                        long double correct4 =
-                            f->dfunc.f_ffpI(s[j], -0.0, &correct4i);
-                        float err2 =
-                            Bruteforce_Ulp_Error_Double(test, correct3);
-                        float err3 =
-                            Bruteforce_Ulp_Error_Double(test, correct4);
-                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
-                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
-                        fail = fail
-                            && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0))
-                                && (!(fabsf(err3) <= f->double_ulps
-                                      && iErr4 == 0)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
-                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-
-                        // retry per section 6.5.3.4
-                        if (IsDoubleResultSubnormal(correct2, f->double_ulps)
-                            || IsDoubleResultSubnormal(correct3,
-                                                       f->double_ulps))
-                        {
-                            fail = fail
-                                && !(test == 0.0f
-                                     && (iErr3 == 0 || iErr4 == 0));
-                            if (!fail) err = 0.0f;
-                        }
-                    }
-                }
-                    if (fabsf(err) > maxError)
-                    {
-                        maxError = fabsf(err);
-                        maxErrorVal = s[j];
-                    }
-                    if (llabs(iErr) > maxError2)
-                    {
-                        maxError2 = llabs(iErr);
-                        maxErrorVal2 = s[j];
-                    }
-
-                    if (fail)
-                    {
-                        vlog_error(
-                            "\nERROR: %sD%s: {%f, %lld} ulp error at {%.13la, "
-                            "%.13la} ({ 0x%16.16llx, 0x%16.16llx}): *{%.13la, "
-                            "%d} ({ 0x%16.16llx, 0x%8.8x}) vs. {%.13la, %d} ({ "
-                            "0x%16.16llx, 0x%8.8x})\n",
-                            f->name, sizeNames[k], err, iErr,
-                            ((double *)gIn)[j], ((double *)gIn2)[j],
-                            ((cl_ulong *)gIn)[j], ((cl_ulong *)gIn2)[j],
-                            ((double *)gOut_Ref)[j], ((int *)gOut_Ref2)[j],
-                            ((cl_ulong *)gOut_Ref)[j],
-                            ((cl_uint *)gOut_Ref2)[j], test, q2[j],
-                            ((cl_ulong *)q)[j], ((cl_uint *)q2)[j]);
-                        error = -1;
-                        goto exit;
-                    }
-            }
-        }
-
-        if (0 == (i & 0x0fffffff))
-        {
-            if (gVerboseBruteForce)
-            {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
-            }
-            else
-            {
-                vlog(".");
-            }
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
-    vlog("\n");
-
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}
diff --git a/test_conformance/math_brute_force/binary_two_results_i_double.cpp b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
new file mode 100644
index 0000000000..5f1ba3b20a
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
@@ -0,0 +1,671 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <limits.h>
+#include <string.h>
+
+// Builds the double-precision kernel "math_kernel<size>" that calls `name`
+// with two double inputs plus an int out-pointer. 3-wide vectors get their
+// own source (c3), which uses vload3/vstore3 and handles the buffer tail.
+// Returns whatever MakeKernel returns.
+static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
+                             cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global int",
+                        sizeNames[vectorSize],
+                        "* out2, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], out2 + i );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global int* out2, __global double* in, "
+        "__global double* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, &i0 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "       vstore3( i0, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, &i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               out2[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               out2[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    // Hand the fragment list that matches the vector width to MakeKernel.
+    if (sizeValues[vectorSize] == 3)
+        return MakeKernel(c3, (cl_uint)(sizeof(c3) / sizeof(c3[0])), testName,
+                          k, p, relaxedMode);
+    return MakeKernel(c, (cl_uint)(sizeof(c) / sizeof(c[0])), testName, k, p,
+                      relaxedMode);
+}
+
+typedef struct BuildKernelInfo // arguments shared by the kernel-build jobs
+{ // NOTE: filled by a positional initializer, so member order matters.
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // output array, indexed by offset + job_id
+    cl_program *programs; // output array, indexed by offset + job_id
+    const char *nameInCode; // forwarded to BuildKernelDouble as `name`
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{ // Thread-pool job: builds the kernel for vector index offset + job_id.
+    const BuildKernelInfo *job = (const BuildKernelInfo *)p;
+    const cl_uint slot = job->offset + job_id;
+    return BuildKernelDouble(job->nameInCode, slot, &job->kernels[slot],
+                             &job->programs[slot], job->relaxedMode);
+}
+
+typedef struct ComputeReferenceInfoD_ // arguments shared by ReferenceD jobs
+{
+    const double *x; // first input array
+    const double *y; // second input array
+    double *r; // receives f_ffpI's return value, narrowed to double
+    int *i; // receives the int f_ffpI stores through its pointer argument
+    long double (*f_ffpI)(long double, long double, int *); // reference fn
+    cl_uint lim; // total number of elements across all jobs
+    cl_uint count; // elements per job (ReferenceD clamps to lim)
+} ComputeReferenceInfoD;
+
+// Thread-pool job: computes reference results for slice jid of the inputs.
+static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
+{
+    ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
+    long double (*f)(long double, long double, int *) = cri->f_ffpI;
+    cl_uint lim = cri->lim;
+    cl_uint count = cri->count;
+    cl_uint off = jid * count; // first element of this job's slice
+
+    Force64BitFPUPrecision();
+
+    // A trailing job can start at or past lim because count is rounded up;
+    // "lim - off" would then wrap to a huge unsigned value, so do nothing.
+    if (off >= lim) return CL_SUCCESS;
+    if (count > lim - off) count = lim - off; // last slice may be short
+
+    for (cl_uint j = 0; j < count; ++j)
+        cri->r[off + j] = (double)f((long double)cri->x[off + j],
+                                    (long double)cri->y[off + j],
+                                    cri->i + off + j);
+    return CL_SUCCESS;
+}
+
+int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError = 0.0f;
+    int64_t maxError2 = 0;
+    int ftz = f->ftz || gForceFTZ;
+    double maxErrorVal = 0.0f;
+    double maxErrorVal2 = 0.0f;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(double), bufferSize);
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    cl_uint threadCount = GetThreadCount();
+
+    Force64BitFPUPrecision();
+
+    int testingRemquo = !strcmp(f->name, "remquo");
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        double *p = (double *)gIn;
+        double *p2 = (double *)gIn2;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+        {
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+            p2[j] = DoubleFromUInt32(genrand_int32(d));
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+
+            memset_pattern4(gOut2[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        double *s = (double *)gIn;
+        double *s2 = (double *)gIn2;
+
+        if (threadCount > 1)
+        {
+            ComputeReferenceInfoD cri;
+            cri.x = s;
+            cri.y = s2;
+            cri.r = (double *)gOut_Ref;
+            cri.i = (int *)gOut_Ref2;
+            cri.f_ffpI = f->dfunc.f_ffpI;
+            cri.lim = bufferSize / sizeof(double);
+            cri.count = (cri.lim + threadCount - 1) / threadCount;
+            ThreadPool_Do(ReferenceD, threadCount, &cri);
+        }
+        else
+        {
+            double *r = (double *)gOut_Ref;
+            int *r2 = (int *)gOut_Ref2;
+            for (j = 0; j < bufferSize / sizeof(double); j++)
+                r[j] = (double)f->dfunc.f_ffpI(s[j], s2[j], r2 + j);
+        }
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint64_t *t = (uint64_t *)gOut_Ref;
+        int32_t *t2 = (int32_t *)gOut_Ref2;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint64_t *q = (uint64_t *)gOut[k];
+                int32_t *q2 = (int32_t *)gOut2[k];
+
+                // Check for exact match to correctly rounded result
+                if (t[j] == q[j] && t2[j] == q2[j]) continue;
+
+                // Check for paired NaNs
+                if ((t[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL
+                    && (q[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL
+                    && t2[j] == q2[j])
+                    continue;
+
+                double test = ((double *)q)[j];
+                int correct2 = INT_MIN;
+                long double correct = f->dfunc.f_ffpI(s[j], s2[j], &correct2);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int64_t iErr;
+
+                // in case of remquo, we only care about the sign and last
+                // seven bits of integer as per the spec.
+                if (testingRemquo)
+                    iErr = (long long)(q2[j] & 0x0000007f)
+                        - (long long)(correct2 & 0x0000007f);
+                else
+                    iErr = (long long)q2[j] - (long long)correct2;
+
+                // For remquo, if y = 0, x is infinite, or either is NaN
+                // then the standard either neglects to say what is returned
+                // in iptr or leaves it undefined or implementation defined.
+                int iptrUndefined = fabs(((double *)gIn)[j]) == INFINITY
+                    || ((double *)gIn2)[j] == 0.0 || isnan(((double *)gIn2)[j])
+                    || isnan(((double *)gIn)[j]);
+                if (iptrUndefined) iErr = 0;
+
+                int fail = !(fabsf(err) <= f->double_ulps && iErr == 0);
+                if (ftz && fail)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsDoubleResultSubnormal(correct, f->double_ulps))
+                    {
+                        fail = fail && !(test == 0.0f && iErr == 0);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsDoubleSubnormal(s[j]))
+                    {
+                        int correct3i, correct4i;
+                        long double correct3 =
+                            f->dfunc.f_ffpI(0.0, s2[j], &correct3i);
+                        long double correct4 =
+                            f->dfunc.f_ffpI(-0.0, s2[j], &correct4i);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= f->double_ulps
+                                      && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4
+                        if (IsDoubleResultSubnormal(correct2, f->double_ulps)
+                            || IsDoubleResultSubnormal(correct3,
+                                                       f->double_ulps))
+                        {
+                            fail = fail
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // try with both args as zero
+                        if (IsDoubleSubnormal(s2[j]))
+                        {
+                            int correct7i, correct8i;
+                            correct3 = f->dfunc.f_ffpI(0.0, 0.0, &correct3i);
+                            correct4 = f->dfunc.f_ffpI(-0.0, 0.0, &correct4i);
+                            long double correct7 =
+                                f->dfunc.f_ffpI(0.0, -0.0, &correct7i);
+                            long double correct8 =
+                                f->dfunc.f_ffpI(-0.0, -0.0, &correct8i);
+                            err2 = Bruteforce_Ulp_Error_Double(test, correct3);
+                            err3 = Bruteforce_Ulp_Error_Double(test, correct4);
+                            float err4 =
+                                Bruteforce_Ulp_Error_Double(test, correct7);
+                            float err5 =
+                                Bruteforce_Ulp_Error_Double(test, correct8);
+                            iErr3 = (long long)q2[j] - (long long)correct3i;
+                            iErr4 = (long long)q2[j] - (long long)correct4i;
+                            int64_t iErr7 =
+                                (long long)q2[j] - (long long)correct7i;
+                            int64_t iErr8 =
+                                (long long)q2[j] - (long long)correct8i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps
+                                       && iErr3 == 0))
+                                    && (!(fabsf(err3) <= f->double_ulps
+                                          && iErr4 == 0))
+                                    && (!(fabsf(err4) <= f->double_ulps
+                                          && iErr7 == 0))
+                                    && (!(fabsf(err5) <= f->double_ulps
+                                          && iErr8 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+                            if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
+                            if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct3,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct4,
+                                                           f->double_ulps)
+                                || IsDoubleResultSubnormal(correct7,
+                                                           f->double_ulps)
+                                || IsDoubleResultSubnormal(correct8,
+                                                           f->double_ulps))
+                            {
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0
+                                             || iErr7 == 0 || iErr8 == 0));
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    else if (IsDoubleSubnormal(s2[j]))
+                    {
+                        int correct3i, correct4i;
+                        long double correct3 =
+                            f->dfunc.f_ffpI(s[j], 0.0, &correct3i);
+                        long double correct4 =
+                            f->dfunc.f_ffpI(s[j], -0.0, &correct4i);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= f->double_ulps
+                                      && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4
+                        if (IsDoubleResultSubnormal(correct2, f->double_ulps)
+                            || IsDoubleResultSubnormal(correct3,
+                                                       f->double_ulps))
+                        {
+                            fail = fail
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+                if (fabsf(err) > maxError)
+                {
+                    maxError = fabsf(err);
+                    maxErrorVal = s[j];
+                }
+                if (llabs(iErr) > maxError2)
+                {
+                    maxError2 = llabs(iErr);
+                    maxErrorVal2 = s[j];
+                }
+
+                if (fail)
+                {
+                    vlog_error(
+                        "\nERROR: %sD%s: {%f, %lld} ulp error at {%.13la, "
+                        "%.13la} ({ 0x%16.16llx, 0x%16.16llx}): *{%.13la, "
+                        "%d} ({ 0x%16.16llx, 0x%8.8x}) vs. {%.13la, %d} ({ "
+                        "0x%16.16llx, 0x%8.8x})\n",
+                        f->name, sizeNames[k], err, iErr, ((double *)gIn)[j],
+                        ((double *)gIn2)[j], ((cl_ulong *)gIn)[j],
+                        ((cl_ulong *)gIn2)[j], ((double *)gOut_Ref)[j],
+                        ((int *)gOut_Ref2)[j], ((cl_ulong *)gOut_Ref)[j],
+                        ((cl_uint *)gOut_Ref2)[j], test, q2[j],
+                        ((cl_ulong *)q)[j], ((cl_uint *)q2)[j]);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        double *p = (double *)gIn;
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_two_results_i_float.cpp b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
new file mode 100644
index 0000000000..4ea7a85dae
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
@@ -0,0 +1,657 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <limits.h>
+#include <string.h>
+
+// Assembles OpenCL C source for a kernel applying the builtin `name` (float
+// result plus an int written through a pointer) at one vector size index,
+// then forwards the fragments, the entry point "math_kernel<size suffix>",
+// k, p and relaxedMode to MakeKernel and returns its status.
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
+{
+    // Every size except 3: one work-item handles one vector element.
+    const char *genericSrc[] = {
+        "__kernel void math_kernel", sizeNames[vectorSize],
+        "( __global float", sizeNames[vectorSize],
+        "* out, __global int", sizeNames[vectorSize],
+        "* out2, __global float", sizeNames[vectorSize],
+        "* in1, __global float", sizeNames[vectorSize],
+        "* in2 )\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   out[i] = ",
+        name,
+        "( in1[i], in2[i], out2 + i );\n"
+        "}\n"
+    };
+
+    // Size 3: buffers are scalar arrays accessed with vload3/vstore3; the
+    // last work-item takes the NAN-padded tail path instead.
+    const char *vec3Src[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global int* out2, __global float* in, "
+        "__global float* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, &i0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( i0, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, &i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    const bool isVec3 = (sizeValues[vectorSize] == 3);
+    const char **fragments = isVec3 ? vec3Src : genericSrc;
+    const size_t fragmentCount = isVec3
+        ? sizeof(vec3Src) / sizeof(vec3Src[0])
+        : sizeof(genericSrc) / sizeof(genericSrc[0]);
+
+    char entryPoint[32];
+    snprintf(entryPoint, sizeof(entryPoint) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+    return MakeKernel(fragments, (cl_uint)fragmentCount, entryPoint, k, p,
+                      relaxedMode);
+}
+
+typedef struct BuildKernelInfo // Settings read by BuildKernel_FloatFn jobs.
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // entry [offset + job_id] is passed to BuildKernel
+    cl_program *programs; // entry [offset + job_id] is passed to BuildKernel
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
+{
+    const BuildKernelInfo *job = (const BuildKernelInfo *)p; // job settings
+    const cl_uint sizeIndex = job->offset + job_id; // job 0 builds `offset`
+    return BuildKernel(job->nameInCode, sizeIndex, &job->kernels[sizeIndex],
+                       &job->programs[sizeIndex], job->relaxedMode);
+}
+
+typedef struct ComputeReferenceInfoF_ // Work description read by ReferenceF.
+{
+    const float *x; // first operands
+    const float *y; // second operands
+    float *r; // receives each f_ffpI return value, converted to float
+    int *i; // receives the int f_ffpI stores through its last argument
+    double (*f_ffpI)(double, double, int *); // evaluated once per element
+    cl_uint lim; // number of elements overall; jobs never go past it
+    cl_uint count; // number of elements assigned to each job
+} ComputeReferenceInfoF;
+
+// ThreadPool_Do job: computes reference results for elements
+// [jid * count, (jid + 1) * count) of userInfo's buffers, clipped to lim.
+static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
+{
+    const ComputeReferenceInfoF *cri = (const ComputeReferenceInfoF *)userInfo;
+    const cl_uint off = jid * cri->count;
+    // A job that starts at or past lim has no work. Return before computing
+    // lim - off, which would wrap around (cl_uint) and overrun the buffers.
+    if (off >= cri->lim) return CL_SUCCESS;
+    cl_uint count = cri->lim - off; // elements remaining from off
+    if (count > cri->count) count = cri->count;
+
+    const float *x = cri->x + off;
+    const float *y = cri->y + off;
+    float *r = cri->r + off;
+    int *i = cri->i + off;
+    for (cl_uint j = 0; j < count; ++j)
+        r[j] = (float)cri->f_ffpI((double)x[j], (double)y[j], i + j);
+    return CL_SUCCESS;
+}
+
+int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError = 0.0f;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    int64_t maxError2 = 0;
+    float maxErrorVal = 0.0f;
+    float maxErrorVal2 = 0.0f;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
+
+    cl_uint threadCount = GetThreadCount();
+
+    float float_ulps;
+    if (gIsEmbedded)
+        float_ulps = f->float_embedded_ulps;
+    else
+        float_ulps = f->float_ulps;
+
+    int testingRemquo = !strcmp(f->name, "remquo");
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
+            p[j] = genrand_int32(d);
+            p2[j] = genrand_int32(d);
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+
+            memset_pattern4(gOut2[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        float *s = (float *)gIn;
+        float *s2 = (float *)gIn2;
+
+        if (threadCount > 1)
+        {
+            ComputeReferenceInfoF cri;
+            cri.x = s;
+            cri.y = s2;
+            cri.r = (float *)gOut_Ref;
+            cri.i = (int *)gOut_Ref2;
+            cri.f_ffpI = f->func.f_ffpI;
+            cri.lim = bufferSize / sizeof(float);
+            cri.count = (cri.lim + threadCount - 1) / threadCount;
+            ThreadPool_Do(ReferenceF, threadCount, &cri);
+        }
+        else
+        {
+            float *r = (float *)gOut_Ref;
+            int *r2 = (int *)gOut_Ref2;
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                r[j] = (float)f->func.f_ffpI(s[j], s2[j], r2 + j);
+        }
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint32_t *t = (uint32_t *)gOut_Ref;
+        int32_t *t2 = (int32_t *)gOut_Ref2;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint32_t *q = (uint32_t *)(gOut[k]);
+                int32_t *q2 = (int32_t *)gOut2[k];
+
+                // Check for exact match to correctly rounded result
+                if (t[j] == q[j] && t2[j] == q2[j]) continue;
+
+                // Check for paired NaNs
+                if ((t[j] & 0x7fffffff) > 0x7f800000
+                    && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j])
+                    continue;
+
+                float test = ((float *)q)[j];
+                int correct2 = INT_MIN;
+                double correct = f->func.f_ffpI(s[j], s2[j], &correct2);
+                float err = Ulp_Error(test, correct);
+                int64_t iErr;
+
+                // in case of remquo, we only care about the sign and last
+                // seven bits of integer as per the spec.
+                if (testingRemquo)
+                    iErr = (long long)(q2[j] & 0x0000007f)
+                        - (long long)(correct2 & 0x0000007f);
+                else
+                    iErr = (long long)q2[j] - (long long)correct2;
+
+                // For remquo, if y = 0, x is infinite, or either is NaN
+                // then the standard either neglects to say what is returned
+                // in iptr or leaves it undefined or implementation defined.
+                int iptrUndefined = fabs(((float *)gIn)[j]) == INFINITY
+                    || ((float *)gIn2)[j] == 0.0f || isnan(((float *)gIn2)[j])
+                    || isnan(((float *)gIn)[j]);
+                if (iptrUndefined) iErr = 0;
+
+                int fail = !(fabsf(err) <= float_ulps && iErr == 0);
+                if (ftz && fail)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsFloatResultSubnormal(correct, float_ulps))
+                    {
+                        fail = fail && !(test == 0.0f && iErr == 0);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsFloatSubnormal(s[j]))
+                    {
+                        int correct3i, correct4i;
+                        double correct3 =
+                            f->func.f_ffpI(0.0, s2[j], &correct3i);
+                        double correct4 =
+                            f->func.f_ffpI(-0.0, s2[j], &correct4i);
+                        float err2 = Ulp_Error(test, correct3);
+                        float err3 = Ulp_Error(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= float_ulps
+                                      && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4
+                        if (IsFloatResultSubnormal(correct2, float_ulps)
+                            || IsFloatResultSubnormal(correct3, float_ulps))
+                        {
+                            fail = fail
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // try with both args as zero
+                        if (IsFloatSubnormal(s2[j]))
+                        {
+                            int correct7i, correct8i;
+                            correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i);
+                            correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i);
+                            double correct7 =
+                                f->func.f_ffpI(0.0, -0.0, &correct7i);
+                            double correct8 =
+                                f->func.f_ffpI(-0.0, -0.0, &correct8i);
+                            err2 = Ulp_Error(test, correct3);
+                            err3 = Ulp_Error(test, correct4);
+                            float err4 = Ulp_Error(test, correct7);
+                            float err5 = Ulp_Error(test, correct8);
+                            iErr3 = (long long)q2[j] - (long long)correct3i;
+                            iErr4 = (long long)q2[j] - (long long)correct4i;
+                            int64_t iErr7 =
+                                (long long)q2[j] - (long long)correct7i;
+                            int64_t iErr8 =
+                                (long long)q2[j] - (long long)correct8i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
+                                    && (!(fabsf(err3) <= float_ulps
+                                          && iErr4 == 0))
+                                    && (!(fabsf(err4) <= float_ulps
+                                          && iErr7 == 0))
+                                    && (!(fabsf(err5) <= float_ulps
+                                          && iErr8 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+                            if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
+                            if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
+
+                            // retry per section 6.5.3.4
+                            if (IsFloatResultSubnormal(correct3, float_ulps)
+                                || IsFloatResultSubnormal(correct4, float_ulps)
+                                || IsFloatResultSubnormal(correct7, float_ulps)
+                                || IsFloatResultSubnormal(correct8, float_ulps))
+                            {
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0
+                                             || iErr7 == 0 || iErr8 == 0));
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    else if (IsFloatSubnormal(s2[j]))
+                    {
+                        int correct3i, correct4i;
+                        double correct3 = f->func.f_ffpI(s[j], 0.0, &correct3i);
+                        double correct4 =
+                            f->func.f_ffpI(s[j], -0.0, &correct4i);
+                        float err2 = Ulp_Error(test, correct3);
+                        float err3 = Ulp_Error(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= float_ulps
+                                      && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4
+                        if (IsFloatResultSubnormal(correct2, float_ulps)
+                            || IsFloatResultSubnormal(correct3, float_ulps))
+                        {
+                            fail = fail
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+                if (fabsf(err) > maxError)
+                {
+                    maxError = fabsf(err);
+                    maxErrorVal = s[j];
+                }
+                if (llabs(iErr) > maxError2)
+                {
+                    maxError2 = llabs(iErr);
+                    maxErrorVal2 = s[j];
+                }
+
+                if (fail)
+                {
+                    vlog_error(
+                        "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} "
+                        "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, "
+                        "0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n",
+                        f->name, sizeNames[k], err, iErr, ((float *)gIn)[j],
+                        ((float *)gIn2)[j], ((cl_uint *)gIn)[j],
+                        ((cl_uint *)gIn2)[j], ((float *)gOut_Ref)[j],
+                        ((int *)gOut_Ref2)[j], ((cl_uint *)gOut_Ref)[j],
+                        ((cl_uint *)gOut_Ref2)[j], test, q2[j],
+                        ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        cl_uint *p = (cl_uint *)gIn;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
+            p[j] = genrand_int32(d);
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary_double.cpp
similarity index 52%
rename from test_conformance/math_brute_force/i_unary.cpp
rename to test_conformance/math_brute_force/i_unary_double.cpp
index 9418d44def..8cb863b311 100644
--- a/test_conformance/math_brute_force/i_unary.cpp
+++ b/test_conformance/math_brute_force/i_unary_double.cpp
@@ -20,84 +20,6 @@
 
 #include <string.h>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global int",
-                        sizeNames[vectorSize],
-                        "* out, __global float",
-                        sizeNames[vectorSize],
-                        "* in)\n"
-                        "{\n"
-                        "   int i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global int* out, __global float* in)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       float3 f0 = vload3( 0, in + 3 * i );\n"
-        "       int3 i0 = ",
-        name,
-        "( f0 );\n"
-        "       vstore3( i0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       float3 f0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       int3 i0 = ",
-        name,
-        "( f0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = i0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = i0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
@@ -187,15 +109,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
-}
-
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
@@ -205,259 +118,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->programs + i, info->relaxedMode);
 }
 
-int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    // This test is not using ThreadPool so we need to disable FTZ here
-    // for reference computations
-    FPU_mode_type oldMode;
-    DisableFTZ(&oldMode);
-
-    Force64BitFPUPrecision();
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        cl_uint *p = (cl_uint *)gIn;
-        if (gWimpyMode)
-        {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
-                p[j] = (cl_uint)i + j * scale;
-        }
-        else
-        {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
-                p[j] = (uint32_t)i + j;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        int *r = (int *)gOut_Ref;
-        float *s = (float *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-            r[j] = f->func.i_f(s[j]);
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data
-        uint32_t *t = (uint32_t *)gOut_Ref;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-            {
-                uint32_t *q = (uint32_t *)(gOut[k]);
-                // If we aren't getting the correctly rounded result
-                if (t[j] != q[j])
-                {
-                    if (ftz && IsFloatSubnormal(s[j]))
-                    {
-                        unsigned int correct0 = f->func.i_f(0.0);
-                        unsigned int correct1 = f->func.i_f(-0.0);
-                        if (q[j] == correct0 || q[j] == correct1) continue;
-                    }
-
-                    uint32_t err = t[j] - q[j];
-                    if (q[j] > t[j]) err = q[j] - t[j];
-                    vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%8.8x): "
-                               "*%d vs. %d\n",
-                               f->name, sizeNames[k], err, ((float *)gIn)[j],
-                               ((cl_uint *)gIn)[j], t[j], q[j]);
-                    error = -1;
-                    goto exit;
-                }
-            }
-        }
-
-        if (0 == (i & 0x0fffffff))
-        {
-            if (gVerboseBruteForce)
-            {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
-            }
-            else
-            {
-                vlog(".");
-            }
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-            p[j] = genrand_int32(d);
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    vlog("\n");
-
-exit:
-    RestoreFPState(&oldMode);
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}
-
 int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     uint64_t i;
diff --git a/test_conformance/math_brute_force/i_unary_float.cpp b/test_conformance/math_brute_force/i_unary_float.cpp
new file mode 100644
index 0000000000..feecb54c1e
--- /dev/null
+++ b/test_conformance/math_brute_force/i_unary_float.cpp
@@ -0,0 +1,370 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+// Builds the kernel math_kernel<suffix>: out[i] = name(in[i]), float in and
+// int out, for the vector size at index vectorSize. Size 3 uses a variant
+// built on vload3/vstore3 with a scalar tail. Returns MakeKernel's result.
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global int",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global int* out, __global float* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( i0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    const bool isVec3 = sizeValues[vectorSize] == 3;
+    const char **kern = isVec3 ? c3 : c;
+    const size_t kernSize =
+        isVec3 ? sizeof(c3) / sizeof(c3[0]) : sizeof(c) / sizeof(c[0]);
+
+    // snprintf's size argument already accounts for the terminating NUL.
+    char testName[32];
+    snprintf(testName, sizeof(testName), "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo // argument block for BuildKernel_FloatFn
+{
+    cl_uint offset; // index of the first vector size to build
+    cl_kernel *kernels; // kernel slots, one per vector-size index
+    cl_program *programs; // program slots, same indexing as kernels
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
+{
+    const BuildKernelInfo *ctx = (const BuildKernelInfo *)p; // job arguments
+    const cl_uint slot = ctx->offset + job_id; // vector-size index to build
+    return BuildKernel(ctx->nameInCode, slot, &ctx->kernels[slot],
+                       &ctx->programs[slot], ctx->relaxedMode);
+}
+
+int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    // This test is not using ThreadPool so we need to disable FTZ here
+    // for reference computations
+    FPU_mode_type oldMode;
+    DisableFTZ(&oldMode);
+
+    Force64BitFPUPrecision();
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_uint *p = (cl_uint *)gIn;
+        if (gWimpyMode)
+        {
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (cl_uint)i + j * scale;
+        }
+        else
+        {
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        int *r = (int *)gOut_Ref;
+        float *s = (float *)gIn;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            r[j] = f->func.i_f(s[j]);
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint32_t *t = (uint32_t *)gOut_Ref;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint32_t *q = (uint32_t *)(gOut[k]);
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j])
+                {
+                    if (ftz && IsFloatSubnormal(s[j]))
+                    {
+                        unsigned int correct0 = f->func.i_f(0.0);
+                        unsigned int correct1 = f->func.i_f(-0.0);
+                        if (q[j] == correct0 || q[j] == correct1) continue;
+                    }
+
+                    uint32_t err = t[j] - q[j];
+                    if (q[j] > t[j]) err = q[j] - t[j];
+                    vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%8.8x): "
+                               "*%d vs. %d\n",
+                               f->name, sizeNames[k], err, ((float *)gIn)[j],
+                               ((cl_uint *)gIn)[j], t[j], q[j]);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        uint32_t *p = (uint32_t *)gIn;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            p[j] = genrand_int32(d);
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    vlog("\n");
+
+exit:
+    RestoreFPState(&oldMode);
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
similarity index 53%
rename from test_conformance/math_brute_force/macro_binary.cpp
rename to test_conformance/math_brute_force/macro_binary_double.cpp
index fb88e6072b..9b5d8f2480 100644
--- a/test_conformance/math_brute_force/macro_binary.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -20,91 +20,6 @@
 
 #include <string.h>
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global int",
-                        sizeNames[vectorSize],
-                        "* out, __global float",
-                        sizeNames[vectorSize],
-                        "* in1, __global float",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global int* out, __global float* in, __global float* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       float3 f0 = vload3( 0, in + 3 * i );\n"
-        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-        "       int3 i0 = ",
-        name,
-        "( f0, f1 );\n"
-        "       vstore3( i0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       float3 f0;\n"
-        "       float3 f1;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       int3 i0 = ",
-        name,
-        "( f0, f1 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = i0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = i0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
 static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
                              bool relaxedMode)
@@ -203,15 +118,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
-}
-
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
@@ -222,112 +128,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->relaxedMode);
 }
 
-// A table of more difficult cases to get right
-static const float specialValuesFloat[] = {
-    -NAN,
-    -INFINITY,
-    -FLT_MAX,
-    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
-    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
-    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
-    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
-    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
-    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
-    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
-    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
-    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
-    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
-    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
-    -1000.f,
-    -100.f,
-    -4.0f,
-    -3.5f,
-    -3.0f,
-    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
-    -2.5f,
-    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
-    -2.0f,
-    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
-    -1.5f,
-    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
-    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
-    -1.0f,
-    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
-    -0.5f,
-    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
-    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
-    -0.25f,
-    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
-    -FLT_MIN,
-    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
-    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
-    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
-    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
-    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
-    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
-    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
-    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
-    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
-    -0.0f,
-
-    +NAN,
-    +INFINITY,
-    +FLT_MAX,
-    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
-    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
-    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
-    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
-    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
-    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
-    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
-    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
-    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
-    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
-    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
-    +1000.f,
-    +100.f,
-    +4.0f,
-    +3.5f,
-    +3.0f,
-    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
-    2.5f,
-    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
-    +2.0f,
-    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
-    1.5f,
-    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
-    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
-    +1.0f,
-    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
-    +0.5f,
-    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
-    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
-    +0.25f,
-    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
-    +FLT_MIN,
-    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
-    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
-    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
-    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
-    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
-    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
-    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
-    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
-    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
-    +0.0f
-};
-
-static const size_t specialValuesFloatCount =
-    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
-
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
@@ -356,579 +156,6 @@ typedef struct TestInfo
 
 } TestInfo;
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
-
-int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    size_t i, j;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_float)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gInBuffer for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        cl_uint *p = (cl_uint *)gIn;
-        cl_uint *p2 = (cl_uint *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
-        {
-            p[j] = genrand_int32(d);
-            p2[j] = genrand_int32(d);
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
-{
-    const TestInfo *job = (const TestInfo *)data;
-    size_t buffer_elements = job->subBufferSize;
-    size_t buffer_size = buffer_elements * sizeof(cl_float);
-    cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
-    fptr func = job->f->func;
-    int ftz = job->ftz;
-    MTdata d = tinfo->d;
-    cl_uint j, k;
-    cl_int error;
-    const char *name = job->f->name;
-    cl_int *t = 0;
-    cl_int *r = 0;
-    cl_float *s = 0;
-    cl_float *s2 = 0;
-
-    // start the map of the output arrays
-    cl_event e[VECTOR_SIZE_COUNT];
-    cl_int *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_int *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
-
-    // Init input array
-    cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
-    cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
-    j = 0;
-
-    int totalSpecialValueCount =
-        specialValuesFloatCount * specialValuesFloatCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
-
-    if (job_id <= (cl_uint)indx)
-    { // test edge cases
-        float *fp = (float *)p;
-        float *fp2 = (float *)p2;
-        uint32_t x, y;
-
-        x = (job_id * buffer_elements) % specialValuesFloatCount;
-        y = (job_id * buffer_elements) / specialValuesFloatCount;
-
-        for (; j < buffer_elements; j++)
-        {
-            fp[j] = specialValuesFloat[x];
-            fp2[j] = specialValuesFloat[y];
-            ++x;
-            if (x >= specialValuesFloatCount)
-            {
-                x = 0;
-                y++;
-                if (y >= specialValuesFloatCount) break;
-            }
-        }
-    }
-
-    // Init any remaining values.
-    for (; j < buffer_elements; j++)
-    {
-        p[j] = genrand_int32(d);
-        p2[j] = genrand_int32(d);
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
-                                      buffer_size, p, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
-                                      buffer_size, p2, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            goto exit;
-        }
-        if ((error = clReleaseEvent(e[j])))
-        {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // Fill the result buffer with garbage, so that old results don't carry
-        // over
-        uint32_t pattern = 0xffffdead;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // run the kernel
-        size_t vectorCount =
-            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
-                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];
-
-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-
-        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
-                                            &vectorCount, NULL, 0, NULL, NULL)))
-        {
-            vlog_error("FAILED -- could not execute kernel\n");
-            goto exit;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
-
-    if (gSkipCorrectnessTesting) return CL_SUCCESS;
-
-    // Calculate the correctly rounded reference result
-    r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
-    s = (float *)gIn + thread_id * buffer_elements;
-    s2 = (float *)gIn2 + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++) r[j] = func.i_ff(s[j], s2[j]);
-
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_int *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
-            buffer_size, 0, NULL, NULL, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            goto exit;
-        }
-    }
-
-    // Wait for the last buffer
-    out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                          CL_TRUE, CL_MAP_READ, 0, buffer_size,
-                                          0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
-    // Verify data
-    t = (cl_int *)r;
-    for (j = 0; j < buffer_elements; j++)
-    {
-        cl_int *q = out[0];
-
-        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
-        {
-            if (ftz)
-            {
-                if (IsFloatSubnormal(s[j]))
-                {
-                    if (IsFloatSubnormal(s2[j]))
-                    {
-                        int correct = func.i_ff(0.0f, 0.0f);
-                        int correct2 = func.i_ff(0.0f, -0.0f);
-                        int correct3 = func.i_ff(-0.0f, 0.0f);
-                        int correct4 = func.i_ff(-0.0f, -0.0f);
-
-                        if (correct == q[j] || correct2 == q[j]
-                            || correct3 == q[j] || correct4 == q[j])
-                            continue;
-                    }
-                    else
-                    {
-                        int correct = func.i_ff(0.0f, s2[j]);
-                        int correct2 = func.i_ff(-0.0f, s2[j]);
-                        if (correct == q[j] || correct2 == q[j]) continue;
-                    }
-                }
-                else if (IsFloatSubnormal(s2[j]))
-                {
-                    int correct = func.i_ff(s[j], 0.0f);
-                    int correct2 = func.i_ff(s[j], -0.0f);
-                    if (correct == q[j] || correct2 == q[j]) continue;
-                }
-            }
-
-            uint32_t err = t[j] - q[j];
-            if (q[j] > t[j]) err = q[j] - t[j];
-            vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. "
-                       "0x%8.8x (index: %d)\n",
-                       name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j],
-                       j);
-            error = -1;
-            goto exit;
-        }
-
-        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
-        {
-            q = out[k];
-            // If we aren't getting the correctly rounded result
-            if (-t[j] != q[j])
-            {
-                if (ftz)
-                {
-                    if (IsFloatSubnormal(s[j]))
-                    {
-                        if (IsFloatSubnormal(s2[j]))
-                        {
-                            int correct = -func.i_ff(0.0f, 0.0f);
-                            int correct2 = -func.i_ff(0.0f, -0.0f);
-                            int correct3 = -func.i_ff(-0.0f, 0.0f);
-                            int correct4 = -func.i_ff(-0.0f, -0.0f);
-
-                            if (correct == q[j] || correct2 == q[j]
-                                || correct3 == q[j] || correct4 == q[j])
-                                continue;
-                        }
-                        else
-                        {
-                            int correct = -func.i_ff(0.0f, s2[j]);
-                            int correct2 = -func.i_ff(-0.0f, s2[j]);
-                            if (correct == q[j] || correct2 == q[j]) continue;
-                        }
-                    }
-                    else if (IsFloatSubnormal(s2[j]))
-                    {
-                        int correct = -func.i_ff(s[j], 0.0f);
-                        int correct2 = -func.i_ff(s[j], -0.0f);
-                        if (correct == q[j] || correct2 == q[j]) continue;
-                    }
-                }
-                cl_uint err = -t[j] - q[j];
-                if (q[j] > -t[j]) err = q[j] + t[j];
-                vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x "
-                           "vs. 0x%8.8x (index: %d)\n",
-                           name, sizeNames[k], err, ((float *)s)[j],
-                           ((float *)s2)[j], -t[j], q[j], j);
-                error = -1;
-                goto exit;
-            }
-        }
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
-                       j, error);
-            return error;
-        }
-    }
-
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
-
-
-    if (0 == (base & 0x0fffffff))
-    {
-        if (gVerboseBruteForce)
-        {
-            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
-                 "ThreadCount:%2u\n",
-                 base, job->step, job->scale, buffer_elements,
-                 job->threadCount);
-        }
-        else
-        {
-            vlog(".");
-        }
-        fflush(stdout);
-    }
-
-exit:
-    return error;
-}
-
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
     -NAN,
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
new file mode 100644
index 0000000000..ece960373b
--- /dev/null
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -0,0 +1,832 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+// Asks MakeKernels for kernel_count kernels named math_kernel<size>. Each
+// work-item stores name(in1[i], in2[i]) of two float inputs into an int
+// output; 3-element vectors use a vload3/vstore3 source with a scalar tail.
+static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                       cl_kernel *k, cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global int",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global int* out, __global float* in, __global float* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0, f1 );\n"
+        "       vstore3( i0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       int3 i0 = ",
+        name,
+        "( f0, f1 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    // Pick the generic source unless this is the 3-element vector size.
+    const bool isVec3 = (sizeValues[vectorSize] == 3);
+    const char **source = isVec3 ? c3 : c;
+    const size_t partCount =
+        isVec3 ? sizeof(c3) / sizeof(c3[0]) : sizeof(c) / sizeof(c[0]);
+
+    char kernelName[32];
+    snprintf(kernelName, sizeof(kernelName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernels(source, (cl_uint)partCount, kernelName, kernel_count, k,
+                       p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo // job arguments for BuildKernel_FloatFn
+{
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count; // number of kernels requested per vector size
+    cl_kernel **kernels; // kernels[vector size index]: receives the kernels
+    cl_program *programs; // programs[vector size index]: receives the program
+    const char *nameInCode; // function name inserted into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
+{
+    const BuildKernelInfo *b = (const BuildKernelInfo *)p; // job arguments
+    const cl_uint v = b->offset + job_id; // vector size index of this job
+    return BuildKernel(b->nameInCode, v, b->kernel_count, b->kernels[v],
+                       &b->programs[v], b->relaxedMode);
+}
+
+// Hard cases: NaNs, infinities, values next to powers of two, subnormals, 0.
+static const float specialValuesFloat[] = {
+    -NAN,
+    -INFINITY,
+    -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
+    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
+    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
+    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
+    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
+    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
+    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
+    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
+    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
+    -1000.f,
+    -100.f,
+    -4.0f,
+    -3.5f,
+    -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
+    -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
+    -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
+    -1.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
+    -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
+    -0.5f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
+    -0.25f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
+    -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
+    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
+    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
+    -0.0f,
+    // The same magnitudes with positive sign
+    +NAN,
+    +INFINITY,
+    +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
+    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
+    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
+    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
+    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
+    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
+    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
+    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
+    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
+    +1000.f,
+    +100.f,
+    +4.0f,
+    +3.5f,
+    +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
+    2.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
+    +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
+    1.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
+    +1.0f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
+    +0.5f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
+    +0.25f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
+    +FLT_MIN,
+    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
+    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
+    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
+    +0.0f
+};
+
+static const size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]); // entry count
+
+// Thread specific data for a worker thread
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // first input sub-buffer for the thread (of gInBuffer)
+    cl_mem inBuf2; // second input sub-buffer for the thread (of gInBuffer2)
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    MTdata d; // per-thread random number generator state
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo // state shared by all TestFloat jobs of one test
+{
+    size_t subBufferSize; // Size of each thread's sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs passed to ThreadPool_Do
+    cl_uint step; // subBufferSize * scale: step from one chunk to the next
+    cl_uint scale; // stride between individual test values
+    int ftz; // non-zero if running in flush to zero mode
+
+} TestInfo;
+
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
+
+// Tests f, a builtin taking two floats and returning an int (reference:
+// f->func.i_ff), on every enabled vector size. Each worker thread gets its own
+// sub-buffers, command queue, random generator and kernels; TestFloat jobs run
+// on the thread pool and, if gMeasureTimes is set, the kernels are also timed.
+// relaxedMode is passed on to the kernel build. Returns 0 on success or an
+// error code; everything created here is released at the exit label.
+int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    }
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
+        if (error) goto exit;
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input arrays
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        {
+            p[j] = genrand_int32(d);
+            p2[j] = genrand_int32(d);
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // must not return directly: cleanup is still pending
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit; // must not return directly: cleanup is still pending
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    vlog("\n");
+
+exit:
+    // Release; handles never created are still NULL from the memsets above
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
+
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_int *t = 0;
+    cl_int *r = 0;
+    cl_float *s = 0;
+    cl_float *s2 = 0;
+    // Job: test chunk job_id on all vector sizes using thread_id's resources.
+    // Start non-blocking maps of the output buffers; e[j] signals completion.
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_int *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_int *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Flush so the maps start while the host prepares the inputs
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Init this thread's slice of the host input arrays gIn and gIn2
+    cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
+    cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
+    j = 0;
+    // Jobs 0..indx walk all (x, y) pairs of special values in order.
+    int totalSpecialValueCount =
+        specialValuesFloatCount * specialValuesFloatCount;
+    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)indx)
+    { // test edge cases
+        float *fp = (float *)p;
+        float *fp2 = (float *)p2;
+        uint32_t x, y;
+
+        x = (job_id * buffer_elements) % specialValuesFloatCount;
+        y = (job_id * buffer_elements) / specialValuesFloatCount;
+
+        for (; j < buffer_elements; j++)
+        {
+            fp[j] = specialValuesFloat[x];
+            fp2[j] = specialValuesFloat[y];
+            ++x;
+            if (x >= specialValuesFloatCount)
+            {
+                x = 0;
+                y++;
+                if (y >= specialValuesFloatCount) break;
+            }
+        }
+    }
+
+    // Init any remaining values. NOTE(review): overwrites slot j after a break.
+    for (; j < buffer_elements; j++)
+    {
+        p[j] = genrand_int32(d);
+        p2[j] = genrand_int32(d);
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            goto exit;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint32_t pattern = 0xffffdead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            goto exit;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
+    s = (float *)gIn + thread_id * buffer_elements;
+    s2 = (float *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) r[j] = func.i_ff(s[j], s2[j]);
+
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_int *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            goto exit;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                          CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                          0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        goto exit;
+    }
+
+    // Verify data: scalar results equal the reference, vector ones its negation
+    t = (cl_int *)r;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        cl_int *q = out[0];
+
+        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
+        {
+            if (ftz) // flushed subnormal inputs may be treated as +/-0
+            {
+                if (IsFloatSubnormal(s[j]))
+                {
+                    if (IsFloatSubnormal(s2[j]))
+                    {
+                        int correct = func.i_ff(0.0f, 0.0f);
+                        int correct2 = func.i_ff(0.0f, -0.0f);
+                        int correct3 = func.i_ff(-0.0f, 0.0f);
+                        int correct4 = func.i_ff(-0.0f, -0.0f);
+
+                        if (correct == q[j] || correct2 == q[j]
+                            || correct3 == q[j] || correct4 == q[j])
+                            continue;
+                    }
+                    else
+                    {
+                        int correct = func.i_ff(0.0f, s2[j]);
+                        int correct2 = func.i_ff(-0.0f, s2[j]);
+                        if (correct == q[j] || correct2 == q[j]) continue;
+                    }
+                }
+                else if (IsFloatSubnormal(s2[j]))
+                {
+                    int correct = func.i_ff(s[j], 0.0f);
+                    int correct2 = func.i_ff(s[j], -0.0f);
+                    if (correct == q[j] || correct2 == q[j]) continue;
+                }
+            }
+
+            uint32_t err = t[j] - q[j];
+            if (q[j] > t[j]) err = q[j] - t[j];
+            vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. "
+                       "0x%8.8x (index: %d)\n",
+                       name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j],
+                       j);
+            error = -1;
+            goto exit;
+        }
+
+        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
+        {
+            q = out[k];
+            // If we aren't getting the correctly rounded result
+            if (-t[j] != q[j])
+            {
+                if (ftz)
+                {
+                    if (IsFloatSubnormal(s[j]))
+                    {
+                        if (IsFloatSubnormal(s2[j]))
+                        {
+                            int correct = -func.i_ff(0.0f, 0.0f);
+                            int correct2 = -func.i_ff(0.0f, -0.0f);
+                            int correct3 = -func.i_ff(-0.0f, 0.0f);
+                            int correct4 = -func.i_ff(-0.0f, -0.0f);
+
+                            if (correct == q[j] || correct2 == q[j]
+                                || correct3 == q[j] || correct4 == q[j])
+                                continue;
+                        }
+                        else
+                        {
+                            int correct = -func.i_ff(0.0f, s2[j]);
+                            int correct2 = -func.i_ff(-0.0f, s2[j]);
+                            if (correct == q[j] || correct2 == q[j]) continue;
+                        }
+                    }
+                    else if (IsFloatSubnormal(s2[j]))
+                    {
+                        int correct = -func.i_ff(s[j], 0.0f);
+                        int correct2 = -func.i_ff(s[j], -0.0f);
+                        if (correct == q[j] || correct2 == q[j]) continue;
+                    }
+                }
+                cl_uint err = -t[j] - q[j];
+                if (q[j] > -t[j]) err = q[j] + t[j];
+                vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x "
+                           "vs. 0x%8.8x (index: %d)\n",
+                           name, sizeNames[k], err, ((float *)s)[j],
+                           ((float *)s2)[j], -t[j], q[j], j);
+                error = -1;
+                goto exit;
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    // Report progress whenever base is a multiple of 2^28
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+exit:
+    return error;
+}
diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp
new file mode 100644
index 0000000000..8d80abb4b8
--- /dev/null
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -0,0 +1,598 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+// Assembles the OpenCL C source of math_kernel<size suffix>, which applies
+// `name` to doubles and stores long results, and passes it to MakeKernels.
+static int BuildKernelDouble(const char *name, int vectorSize,
+                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                             bool relaxedMode)
+{
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global long",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+    // Width-3 source: vload3/vstore3 on scalar pointers, with a tail case.
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global long* out, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       long3 l0 = ",
+        name,
+        "( d0 );\n"
+        "       vstore3( l0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       long3 l0 = ",
+        name,
+        "( d0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = l0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = l0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    // Use the generic fragments unless the vector width is 3.
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+    // Kernel entry point name: math_kernel followed by the size suffix.
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
+                       relaxedMode);
+}
+
+typedef struct BuildKernelInfo // job data read by BuildKernel_DoubleFn
+{
+    cl_uint offset; // index of the first vector size to build
+    cl_uint kernel_count; // kernel_count forwarded to BuildKernelDouble
+    cl_kernel **kernels; // kernels[i] is passed on for vector size index i
+    cl_program *programs; // programs + i is passed on for vector size index i
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+// Thread-pool job: builds the kernels for vector size index offset + job_id.
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{
+    const BuildKernelInfo *b = (const BuildKernelInfo *)p;
+    const cl_uint ix = b->offset + job_id;
+    return BuildKernelDouble(b->nameInCode, ix, b->kernel_count, b->kernels[ix],
+                             b->programs + ix, b->relaxedMode);
+}
+
+// Per-worker-thread OpenCL objects; TestInfo.tinfo holds one per thread.
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // this thread's sub-buffer of gInBuffer
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // sub-buffer of gOutBuffer[j] per size j
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo // job description handed to every TestDouble call
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of TestDouble jobs given to ThreadPool_Do
+    cl_uint step; // subBufferSize * scale; a job's base is job_id * step
+    cl_uint scale; // stride between individual test values within a job
+    int ftz; // non-zero if running in flush to zero mode (f->ftz || gForceFTZ)
+
+} TestInfo;
+
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data);
+
+// Runs the double-precision macro test for f across all enabled vector sizes:
+// sets up per-thread sub-buffers, queues and kernels, runs TestDouble through
+// the thread pool, optionally times the kernels, then releases everything.
+int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    }
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ftz = f->ftz || gForceFTZ;
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zu, %zu}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        cl_double *p = (cl_double *)gIn;
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            // Leave through exit so the objects created above are released.
+            goto exit;
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    vlog("\n");
+
+exit:
+    // Release
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
+
+// Thread-pool job: fills this thread's slice of gIn with the job's inputs, runs
+// the kernel for every enabled vector size on the thread's own queue, and
+// compares the results with the host reference dfunc.i_f. Scalar results must
+// equal the reference; vector results must equal its negation. Returns
+// CL_SUCCESS, an OpenCL error code, or -1 on a result mismatch.
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint scale = job->scale;
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    dptr dfunc = job->f->dfunc;
+    int ftz = job->ftz;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+
+    Force64BitFPUPrecision();
+
+    // start the map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_long *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_long *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Write the new values to the input array
+    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        p[j] = DoubleFromUInt32(base + j * scale);
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            return error;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            return error;
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint32_t pattern = 0xffffdead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", error);
+            return error;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
+    cl_double *s = (cl_double *)p;
+    for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]);
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_long *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                           0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        return error;
+    }
+
+    // Verify data
+    cl_long *t = (cl_long *)r;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        cl_long *q = out[0];
+
+        // If we aren't getting the correctly rounded result
+        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
+        {
+            if (ftz && IsDoubleSubnormal(s[j]))
+            {
+                cl_long correct = dfunc.i_f(+0.0f);
+                cl_long correct2 = dfunc.i_f(-0.0f);
+                if (correct == q[j] || correct2 == q[j]) continue;
+            }
+
+            cl_ulong err = t[j] - q[j];
+            if (q[j] > t[j]) err = q[j] - t[j];
+            // s[j] is this thread's input; gIn[j] would index thread 0's slice.
+            vlog_error("\nERROR: %sD: %llu ulp error at %.13la: *%lld vs. "
+                       "%lld\n",
+                       name, (unsigned long long)err, s[j], (long long)t[j],
+                       (long long)q[j]);
+            return -1;
+        }
+
+        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
+        {
+            q = out[k];
+            // If we aren't getting the correctly rounded result
+            if (-t[j] != q[j])
+            {
+                if (ftz && IsDoubleSubnormal(s[j]))
+                {
+                    int64_t correct = -dfunc.i_f(+0.0f);
+                    int64_t correct2 = -dfunc.i_f(-0.0f);
+                    if (correct == q[j] || correct2 == q[j]) continue;
+                }
+
+                cl_ulong err = -t[j] - q[j];
+                if (q[j] > -t[j]) err = q[j] + t[j];
+                vlog_error("\nERROR: %sD%s: %llu ulp error at %.13la: *%lld "
+                           "vs. %lld\n",
+                           name, sizeNames[k], (unsigned long long)err, s[j],
+                           (long long)-t[j], (long long)q[j]);
+                return -1;
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+    return CL_SUCCESS;
+}
diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp
similarity index 52%
rename from test_conformance/math_brute_force/macro_unary.cpp
rename to test_conformance/math_brute_force/macro_unary_float.cpp
index e5aa9e70bf..2a37c95ba9 100644
--- a/test_conformance/math_brute_force/macro_unary.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -100,88 +100,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global long",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* in )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global long* out, __global double* in)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 d0 = vload3( 0, in + 3 * i );\n"
-        "       long3 l0 = ",
-        name,
-        "( d0 );\n"
-        "       vstore3( l0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 d0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       long3 l0 = ",
-        name,
-        "( d0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = l0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = l0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
 typedef struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
@@ -201,16 +119,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
-                             info->kernels[i], info->programs + i,
-                             info->relaxedMode);
-}
-
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
@@ -699,452 +607,3 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 
     return ret;
 }
-
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data);
-
-int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    size_t i, j;
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_double)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ftz = f->ftz || gForceFTZ;
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        cl_ulong *p = (cl_ulong *)gIn;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
-{
-    const TestInfo *job = (const TestInfo *)data;
-    size_t buffer_elements = job->subBufferSize;
-    size_t buffer_size = buffer_elements * sizeof(cl_double);
-    cl_uint scale = job->scale;
-    cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
-    dptr dfunc = job->f->dfunc;
-    int ftz = job->ftz;
-    cl_uint j, k;
-    cl_int error;
-    const char *name = job->f->name;
-
-    Force64BitFPUPrecision();
-
-    // start the map of the output arrays
-    cl_event e[VECTOR_SIZE_COUNT];
-    cl_long *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_long *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
-
-    // Write the new values to the input array
-    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
-        p[j] = DoubleFromUInt32(base + j * scale);
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
-                                      buffer_size, p, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        return error;
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            return error;
-        }
-        if ((error = clReleaseEvent(e[j])))
-        {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            return error;
-        }
-
-        // Fill the result buffer with garbage, so that old results don't carry
-        // over
-        uint32_t pattern = 0xffffdead;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            return error;
-        }
-
-        // run the kernel
-        size_t vectorCount =
-            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
-                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];
-
-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-
-        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
-                                            &vectorCount, NULL, 0, NULL, NULL)))
-        {
-            vlog_error("FAILED -- could not execute kernel\n");
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
-
-    if (gSkipCorrectnessTesting) return CL_SUCCESS;
-
-    // Calculate the correctly rounded reference result
-    cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
-    cl_double *s = (cl_double *)p;
-    for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]);
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_long *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
-            buffer_size, 0, NULL, NULL, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Wait for the last buffer
-    out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
-                                           0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        return error;
-    }
-
-    // Verify data
-    cl_long *t = (cl_long *)r;
-    for (j = 0; j < buffer_elements; j++)
-    {
-        cl_long *q = out[0];
-
-        // If we aren't getting the correctly rounded result
-        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
-        {
-            // If we aren't getting the correctly rounded result
-            if (ftz)
-            {
-                if (IsDoubleSubnormal(s[j]))
-                {
-                    cl_long correct = dfunc.i_f(+0.0f);
-                    cl_long correct2 = dfunc.i_f(-0.0f);
-                    if (correct == q[j] || correct2 == q[j]) continue;
-                }
-            }
-
-            cl_ulong err = t[j] - q[j];
-            if (q[j] > t[j]) err = q[j] - t[j];
-            vlog_error("\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n",
-                       name, err, ((double *)gIn)[j], t[j], q[j]);
-            return -1;
-        }
-
-
-        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
-        {
-            q = out[k];
-            // If we aren't getting the correctly rounded result
-            if (-t[j] != q[j])
-            {
-                if (ftz)
-                {
-                    if (IsDoubleSubnormal(s[j]))
-                    {
-                        int64_t correct = -dfunc.i_f(+0.0f);
-                        int64_t correct2 = -dfunc.i_f(-0.0f);
-                        if (correct == q[j] || correct2 == q[j]) continue;
-                    }
-                }
-
-                cl_ulong err = -t[j] - q[j];
-                if (q[j] > -t[j]) err = q[j] + t[j];
-                vlog_error(
-                    "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n",
-                    name, sizeNames[k], err, ((double *)gIn)[j], -t[j], q[j]);
-                return -1;
-            }
-        }
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
-                       j, error);
-            return error;
-        }
-    }
-
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
-
-
-    if (0 == (base & 0x0fffffff))
-    {
-        if (gVerboseBruteForce)
-        {
-            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
-                 "ThreadCount:%2u\n",
-                 base, job->step, job->scale, buffer_elements,
-                 job->threadCount);
-        }
-        else
-        {
-            vlog(".");
-        }
-        fflush(stdout);
-    }
-
-    return CL_SUCCESS;
-}
diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad_double.cpp
similarity index 52%
rename from test_conformance/math_brute_force/mad.cpp
rename to test_conformance/math_brute_force/mad_double.cpp
index 0d8c6d44df..cbbc195123 100644
--- a/test_conformance/math_brute_force/mad.cpp
+++ b/test_conformance/math_brute_force/mad_double.cpp
@@ -20,97 +20,6 @@
 
 #include <string.h>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global float",
-                        sizeNames[vectorSize],
-                        "* out, __global float",
-                        sizeNames[vectorSize],
-                        "* in1, __global float",
-                        sizeNames[vectorSize],
-                        "* in2,  __global float",
-                        sizeNames[vectorSize],
-                        "* in3 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i], in3[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global float* out, __global float* in, __global float* in2, "
-        "__global float* in3)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       float3 f0 = vload3( 0, in + 3 * i );\n"
-        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-        "       float3 f2 = vload3( 0, in3 + 3 * i );\n"
-        "       f0 = ",
-        name,
-        "( f0, f1, f2 );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       float3 f0;\n"
-        "       float3 f1;\n"
-        "       float3 f2;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-        "               f2 = (float3)( in3[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0, f1, f2 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
@@ -213,15 +122,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
-}
-
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
@@ -231,278 +131,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->programs + i, info->relaxedMode);
 }
 
-int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    float maxError = 0.0f;
-    float maxErrorVal = 0.0f;
-    float maxErrorVal2 = 0.0f;
-    float maxErrorVal3 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        cl_uint *p = (cl_uint *)gIn;
-        cl_uint *p2 = (cl_uint *)gIn2;
-        cl_uint *p3 = (cl_uint *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            p[j] = genrand_int32(d);
-            p2[j] = genrand_int32(d);
-            p3[j] = genrand_int32(d);
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
-                                        &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        float *r = (float *)gOut_Ref;
-        float *s = (float *)gIn;
-        float *s2 = (float *)gIn2;
-        float *s3 = (float *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-            r[j] = (float)f->func.f_fff(s[j], s2[j], s3[j]);
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data -- No verification possible.
-        // MAD is a random number generator.
-        if (0 == (i & 0x0fffffff))
-        {
-            vlog(".");
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        cl_uint *p = (cl_uint *)gIn;
-        cl_uint *p2 = (cl_uint *)gIn2;
-        cl_uint *p3 = (cl_uint *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            p[j] = genrand_int32(d);
-            p2[j] = genrand_int32(d);
-            p3[j] = genrand_int32(d);
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
-                                        &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
-             maxErrorVal3);
-    vlog("\n");
-
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}
-
 int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     uint64_t i;
diff --git a/test_conformance/math_brute_force/mad_float.cpp b/test_conformance/math_brute_force/mad_float.cpp
new file mode 100644
index 0000000000..2124b268d6
--- /dev/null
+++ b/test_conformance/math_brute_force/mad_float.cpp
@@ -0,0 +1,402 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2,  __global float",
+                        sizeNames[vectorSize],
+                        "* in3 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], in3[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in, __global float* in2, "
+        "__global float* in3)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       float3 f2 = vload3( 0, in3 + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, f2 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
+        "       float3 f2;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
+        "               f2 = (float3)( in3[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, f2 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels;
+    cl_program *programs;
+    const char *nameInCode;
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
+{
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
+    cl_uint i = info->offset + job_id;
+    return BuildKernel(info->nameInCode, i, info->kernels + i,
+                       info->programs + i, info->relaxedMode);
+}
+
+int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError = 0.0f;
+    float maxErrorVal = 0.0f;
+    float maxErrorVal2 = 0.0f;
+    float maxErrorVal3 = 0.0f;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        cl_uint *p3 = (cl_uint *)gIn3;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
+            p[j] = genrand_int32(d);
+            p2[j] = genrand_int32(d);
+            p3[j] = genrand_int32(d);
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
+            return error;
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        float *r = (float *)gOut_Ref;
+        float *s = (float *)gIn;
+        float *s2 = (float *)gIn2;
+        float *s3 = (float *)gIn3;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            r[j] = (float)f->func.f_fff(s[j], s2[j], s3[j]);
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data -- No verification possible.
+        // MAD is a random number generator.
+        if (0 == (i & 0x0fffffff))
+        {
+            vlog(".");
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        cl_uint *p3 = (cl_uint *)gIn3;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
+            p[j] = genrand_int32(d);
+            p2[j] = genrand_int32(d);
+            p3[j] = genrand_int32(d);
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
+            return error;
+        }
+
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp
new file mode 100644
index 0000000000..427f4efd8c
--- /dev/null
+++ b/test_conformance/math_brute_force/ternary_double.cpp
@@ -0,0 +1,842 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+#define CORRECTLY_ROUNDED 0
+#define FLUSHED 1
+
+// Builds the fp64 kernel math_kernel<size>, which stores name(in1[i], in2[i],
+// in3[i]) to out[i]; k, p and relaxedMode are forwarded to MakeKernel.
+static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
+                             cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2,  __global double",
+                        sizeNames[vectorSize],
+                        "* in3 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], in3[i] );\n"
+                        "}\n" };
+    // Variant for 3-element vectors: vload3/vstore3 plus a tail path.
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2, "
+        "__global double* in3)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       double3 d2 = vload3( 0, in3 + 3 * i );\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, d2 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
+        "       double3 d2;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               d2 = (double3)( in3[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, d2 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    // Use the generic source unless this index maps to a 3-element vector.
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+    // snprintf NUL-terminates within the size given, so pass the full buffer.
+    char testName[32];
+    snprintf(testName, sizeof(testName), "math_kernel%s",
+             sizeNames[vectorSize]);
+    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo // argument bundle for BuildKernel_DoubleFn
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // output array, slot offset + job_id is written
+    cl_program *programs; // output array, same indexing as kernels
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p) // ThreadPool_Do job callback
+{
+    const BuildKernelInfo *job = (const BuildKernelInfo *)p;
+    const cl_uint idx = job->offset + job_id; // vector-size index to build
+    return BuildKernelDouble(job->nameInCode, idx, &job->kernels[idx],
+                             &job->programs[idx], job->relaxedMode);
+}
+
+// Difficult inputs to get right; the positive half mirrors the negative one.
+static const double specialValuesDouble[] = {
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
+
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
+};
+
+static const size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
+
+int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
+                                         bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError = 0.0f;
+    int ftz = f->ftz || gForceFTZ;
+    double maxErrorVal = 0.0f;
+    double maxErrorVal2 = 0.0f;
+    double maxErrorVal3 = 0.0f;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(double), bufferSize);
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    Force64BitFPUPrecision();
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        double *p = (double *)gIn;
+        double *p2 = (double *)gIn2;
+        double *p3 = (double *)gIn3;
+        j = 0;
+        if (i == 0)
+        { // test edge cases
+            uint32_t x, y, z;
+            x = y = z = 0;
+            for (; j < bufferSize / sizeof(double); j++)
+            {
+                p[j] = specialValuesDouble[x];
+                p2[j] = specialValuesDouble[y];
+                p3[j] = specialValuesDouble[z];
+                if (++x >= specialValuesDoubleCount)
+                {
+                    x = 0;
+                    if (++y >= specialValuesDoubleCount)
+                    {
+                        y = 0;
+                        if (++z >= specialValuesDoubleCount) break;
+                    }
+                }
+            }
+            if (j == bufferSize / sizeof(double))
+                vlog_error("Test Error: not all special cases tested!\n");
+        }
+
+        for (; j < bufferSize / sizeof(double); j++)
+        {
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+            p2[j] = DoubleFromUInt32(genrand_int32(d));
+            p3[j] = DoubleFromUInt32(genrand_int32(d));
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
+            return error;
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        double *r = (double *)gOut_Ref;
+        double *s = (double *)gIn;
+        double *s2 = (double *)gIn2;
+        double *s3 = (double *)gIn3;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+            r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]);
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint64_t *t = (uint64_t *)gOut_Ref;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint64_t *q = (uint64_t *)(gOut[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j])
+                {
+                    double test = ((double *)q)[j];
+                    long double correct = f->dfunc.f_fff(s[j], s2[j], s3[j]);
+                    float err = Bruteforce_Ulp_Error_Double(test, correct);
+                    int fail = !(fabsf(err) <= f->double_ulps);
+
+                    if (fail && ftz)
+                    {
+                        // retry per section 6.5.3.2
+                        if (IsDoubleSubnormal(correct))
+                        { // look at me,
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (fail && IsDoubleSubnormal(s[j]))
+                        { // look at me,
+                            long double correct2 =
+                                f->dfunc.f_fff(0.0, s2[j], s3[j]);
+                            long double correct3 =
+                                f->dfunc.f_fff(-0.0, s2[j], s3[j]);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps))
+                                    && (!(fabsf(err3) <= f->double_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
+                            { // look at me now,
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+
+                            // try with first two args as zero
+                            if (IsDoubleSubnormal(s2[j]))
+                            { // its fun to have fun,
+                                correct2 = f->dfunc.f_fff(0.0, 0.0, s3[j]);
+                                correct3 = f->dfunc.f_fff(-0.0, 0.0, s3[j]);
+                                long double correct4 =
+                                    f->dfunc.f_fff(0.0, -0.0, s3[j]);
+                                long double correct5 =
+                                    f->dfunc.f_fff(-0.0, -0.0, s3[j]);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= f->double_ulps))
+                                        && (!(fabsf(err3) <= f->double_ulps))
+                                        && (!(fabsf(err4) <= f->double_ulps))
+                                        && (!(fabsf(err5) <= f->double_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+
+                                // retry per section 6.5.3.4
+                                if (IsDoubleResultSubnormal(correct2,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct3,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct4,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct5,
+                                                               f->double_ulps))
+                                {
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
+                                }
+
+                                if (IsDoubleSubnormal(s3[j]))
+                                { // but you have to know how!
+                                    correct2 = f->dfunc.f_fff(0.0, 0.0, 0.0f);
+                                    correct3 = f->dfunc.f_fff(-0.0, 0.0, 0.0f);
+                                    correct4 = f->dfunc.f_fff(0.0, -0.0, 0.0f);
+                                    correct5 = f->dfunc.f_fff(-0.0, -0.0, 0.0f);
+                                    long double correct6 =
+                                        f->dfunc.f_fff(0.0, 0.0, -0.0f);
+                                    long double correct7 =
+                                        f->dfunc.f_fff(-0.0, 0.0, -0.0f);
+                                    long double correct8 =
+                                        f->dfunc.f_fff(0.0, -0.0, -0.0f);
+                                    long double correct9 =
+                                        f->dfunc.f_fff(-0.0, -0.0, -0.0f);
+                                    err2 = Bruteforce_Ulp_Error_Double(
+                                        test, correct2);
+                                    err3 = Bruteforce_Ulp_Error_Double(
+                                        test, correct3);
+                                    err4 = Bruteforce_Ulp_Error_Double(
+                                        test, correct4);
+                                    err5 = Bruteforce_Ulp_Error_Double(
+                                        test, correct5);
+                                    float err6 = Bruteforce_Ulp_Error_Double(
+                                        test, correct6);
+                                    float err7 = Bruteforce_Ulp_Error_Double(
+                                        test, correct7);
+                                    float err8 = Bruteforce_Ulp_Error_Double(
+                                        test, correct8);
+                                    float err9 = Bruteforce_Ulp_Error_Double(
+                                        test, correct9);
+                                    fail = fail
+                                        && ((!(fabsf(err2) <= f->double_ulps))
+                                            && (!(fabsf(err3)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err4)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err5)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err5)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err6)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err7)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err8)
+                                                  <= f->double_ulps)));
+                                    if (fabsf(err2) < fabsf(err)) err = err2;
+                                    if (fabsf(err3) < fabsf(err)) err = err3;
+                                    if (fabsf(err4) < fabsf(err)) err = err4;
+                                    if (fabsf(err5) < fabsf(err)) err = err5;
+                                    if (fabsf(err6) < fabsf(err)) err = err6;
+                                    if (fabsf(err7) < fabsf(err)) err = err7;
+                                    if (fabsf(err8) < fabsf(err)) err = err8;
+                                    if (fabsf(err9) < fabsf(err)) err = err9;
+
+                                    // retry per section 6.5.3.4
+                                    if (IsDoubleResultSubnormal(correct2,
+                                                                f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct3, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct4, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct5, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct6, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct7, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct8, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct9, f->double_ulps))
+                                    {
+                                        fail = fail && (test != 0.0f);
+                                        if (!fail) err = 0.0f;
+                                    }
+                                }
+                            }
+                            else if (IsDoubleSubnormal(s3[j]))
+                            {
+                                correct2 = f->dfunc.f_fff(0.0, s2[j], 0.0);
+                                correct3 = f->dfunc.f_fff(-0.0, s2[j], 0.0);
+                                long double correct4 =
+                                    f->dfunc.f_fff(0.0, s2[j], -0.0);
+                                long double correct5 =
+                                    f->dfunc.f_fff(-0.0, s2[j], -0.0);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= f->double_ulps))
+                                        && (!(fabsf(err3) <= f->double_ulps))
+                                        && (!(fabsf(err4) <= f->double_ulps))
+                                        && (!(fabsf(err5) <= f->double_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+
+                                // retry per section 6.5.3.4
+                                if (IsDoubleResultSubnormal(correct2,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct3,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct4,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct5,
+                                                               f->double_ulps))
+                                {
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
+                                }
+                            }
+                        }
+                        else if (fail && IsDoubleSubnormal(s2[j]))
+                        {
+                            long double correct2 =
+                                f->dfunc.f_fff(s[j], 0.0, s3[j]);
+                            long double correct3 =
+                                f->dfunc.f_fff(s[j], -0.0, s3[j]);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps))
+                                    && (!(fabsf(err3) <= f->double_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+
+                            // try with second two args as zero
+                            if (IsDoubleSubnormal(s3[j]))
+                            {
+                                correct2 = f->dfunc.f_fff(s[j], 0.0, 0.0);
+                                correct3 = f->dfunc.f_fff(s[j], -0.0, 0.0);
+                                long double correct4 =
+                                    f->dfunc.f_fff(s[j], 0.0, -0.0);
+                                long double correct5 =
+                                    f->dfunc.f_fff(s[j], -0.0, -0.0);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= f->double_ulps))
+                                        && (!(fabsf(err3) <= f->double_ulps))
+                                        && (!(fabsf(err4) <= f->double_ulps))
+                                        && (!(fabsf(err5) <= f->double_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+
+                                // retry per section 6.5.3.4
+                                if (IsDoubleResultSubnormal(correct2,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct3,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct4,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct5,
+                                                               f->double_ulps))
+                                {
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
+                                }
+                            }
+                        }
+                        else if (fail && IsDoubleSubnormal(s3[j]))
+                        {
+                            long double correct2 =
+                                f->dfunc.f_fff(s[j], s2[j], 0.0);
+                            long double correct3 =
+                                f->dfunc.f_fff(s[j], s2[j], -0.0);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps))
+                                    && (!(fabsf(err3) <= f->double_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+
+                    if (fabsf(err) > maxError)
+                    {
+                        maxError = fabsf(err);
+                        maxErrorVal = s[j];
+                        maxErrorVal2 = s2[j];
+                        maxErrorVal3 = s3[j];
+                    }
+
+                    if (fail)
+                    {
+                        vlog_error("\nERROR: %sD%s: %f ulp error at {%.13la, "
+                                   "%.13la, %.13la}: *%.13la vs. %.13la\n",
+                                   f->name, sizeNames[k], err, s[j], s2[j],
+                                   s3[j], ((double *)gOut_Ref)[j], test);
+                        error = -1;
+                        goto exit;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        double *p = (double *)gIn;
+        double *p2 = (double *)gIn2;
+        double *p3 = (double *)gIn3;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+        {
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+            p2[j] = DoubleFromUInt32(genrand_int32(d));
+            p3[j] = DoubleFromUInt32(genrand_int32(d));
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
+            return error;
+        }
+
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary_float.cpp
similarity index 52%
rename from test_conformance/math_brute_force/ternary.cpp
rename to test_conformance/math_brute_force/ternary_float.cpp
index f8908909b5..3b3bde7cc8 100644
--- a/test_conformance/math_brute_force/ternary.cpp
+++ b/test_conformance/math_brute_force/ternary_float.cpp
@@ -114,99 +114,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* in1, __global double",
-                        sizeNames[vectorSize],
-                        "* in2,  __global double",
-                        sizeNames[vectorSize],
-                        "* in3 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i], in3[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global double* in, __global double* in2, "
-        "__global double* in3)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 d0 = vload3( 0, in + 3 * i );\n"
-        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-        "       double3 d2 = vload3( 0, in3 + 3 * i );\n"
-        "       d0 = ",
-        name,
-        "( d0, d1, d2 );\n"
-        "       vstore3( d0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 d0;\n"
-        "       double3 d1;\n"
-        "       double3 d2;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-        "               d2 = (double3)( in3[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       d0 = ",
-        name,
-        "( d0, d1, d2 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = d0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = d0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
 typedef struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
@@ -225,15 +132,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
-}
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -316,7 +214,6 @@ static const float specialValuesFloat[] = {
 static const size_t specialValuesFloatCount =
     sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
-
 int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     uint64_t i;
@@ -1077,711 +974,3 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 
     return error;
 }
-
-// A table of more difficult cases to get right
-static const double specialValuesDouble[] = {
-    -NAN,
-    -INFINITY,
-    -DBL_MAX,
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
-    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
-    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    -3.0,
-    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
-    -2.5,
-    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
-    -2.0,
-    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
-    -1.5,
-    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
-    -1.0,
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
-    -DBL_MIN,
-    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
-    -0.0,
-
-    +NAN,
-    +INFINITY,
-    +DBL_MAX,
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
-    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
-    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    +3.0,
-    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
-    +2.5,
-    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
-    +2.0,
-    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
-    +1.5,
-    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
-    +1.0,
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
-    +DBL_MIN,
-    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
-    +0.0,
-};
-
-static const size_t specialValuesDoubleCount =
-    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
-
-
-int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
-                                         bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    float maxError = 0.0f;
-    int ftz = f->ftz || gForceFTZ;
-    double maxErrorVal = 0.0f;
-    double maxErrorVal2 = 0.0f;
-    double maxErrorVal3 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(double), bufferSize);
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    Force64BitFPUPrecision();
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        double *p2 = (double *)gIn2;
-        double *p3 = (double *)gIn3;
-        j = 0;
-        if (i == 0)
-        { // test edge cases
-            uint32_t x, y, z;
-            x = y = z = 0;
-            for (; j < bufferSize / sizeof(double); j++)
-            {
-                p[j] = specialValuesDouble[x];
-                p2[j] = specialValuesDouble[y];
-                p3[j] = specialValuesDouble[z];
-                if (++x >= specialValuesDoubleCount)
-                {
-                    x = 0;
-                    if (++y >= specialValuesDoubleCount)
-                    {
-                        y = 0;
-                        if (++z >= specialValuesDoubleCount) break;
-                    }
-                }
-            }
-            if (j == bufferSize / sizeof(double))
-                vlog_error("Test Error: not all special cases tested!\n");
-        }
-
-        for (; j < bufferSize / sizeof(double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-            p3[j] = DoubleFromUInt32(genrand_int32(d));
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
-                                        &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        double *r = (double *)gOut_Ref;
-        double *s = (double *)gIn;
-        double *s2 = (double *)gIn2;
-        double *s3 = (double *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-            r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]);
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data
-        uint64_t *t = (uint64_t *)gOut_Ref;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-        {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-            {
-                uint64_t *q = (uint64_t *)(gOut[k]);
-
-                // If we aren't getting the correctly rounded result
-                if (t[j] != q[j])
-                {
-                    double test = ((double *)q)[j];
-                    long double correct = f->dfunc.f_fff(s[j], s2[j], s3[j]);
-                    float err = Bruteforce_Ulp_Error_Double(test, correct);
-                    int fail = !(fabsf(err) <= f->double_ulps);
-
-                    if (fail && ftz)
-                    {
-                        // retry per section 6.5.3.2
-                        if (IsDoubleSubnormal(correct))
-                        { // look at me,
-                            fail = fail && (test != 0.0f);
-                            if (!fail) err = 0.0f;
-                        }
-
-                        // retry per section 6.5.3.3
-                        if (fail && IsDoubleSubnormal(s[j]))
-                        { // look at me,
-                            long double correct2 =
-                                f->dfunc.f_fff(0.0, s2[j], s3[j]);
-                            long double correct3 =
-                                f->dfunc.f_fff(-0.0, s2[j], s3[j]);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct2);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            fail = fail
-                                && ((!(fabsf(err2) <= f->double_ulps))
-                                    && (!(fabsf(err3) <= f->double_ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2,
-                                                        f->double_ulps)
-                                || IsDoubleResultSubnormal(correct3,
-                                                           f->double_ulps))
-                            { // look at me now,
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-
-                            // try with first two args as zero
-                            if (IsDoubleSubnormal(s2[j]))
-                            { // its fun to have fun,
-                                correct2 = f->dfunc.f_fff(0.0, 0.0, s3[j]);
-                                correct3 = f->dfunc.f_fff(-0.0, 0.0, s3[j]);
-                                long double correct4 =
-                                    f->dfunc.f_fff(0.0, -0.0, s3[j]);
-                                long double correct5 =
-                                    f->dfunc.f_fff(-0.0, -0.0, s3[j]);
-                                err2 =
-                                    Bruteforce_Ulp_Error_Double(test, correct2);
-                                err3 =
-                                    Bruteforce_Ulp_Error_Double(test, correct3);
-                                float err4 =
-                                    Bruteforce_Ulp_Error_Double(test, correct4);
-                                float err5 =
-                                    Bruteforce_Ulp_Error_Double(test, correct5);
-                                fail = fail
-                                    && ((!(fabsf(err2) <= f->double_ulps))
-                                        && (!(fabsf(err3) <= f->double_ulps))
-                                        && (!(fabsf(err4) <= f->double_ulps))
-                                        && (!(fabsf(err5) <= f->double_ulps)));
-                                if (fabsf(err2) < fabsf(err)) err = err2;
-                                if (fabsf(err3) < fabsf(err)) err = err3;
-                                if (fabsf(err4) < fabsf(err)) err = err4;
-                                if (fabsf(err5) < fabsf(err)) err = err5;
-
-                                // retry per section 6.5.3.4
-                                if (IsDoubleResultSubnormal(correct2,
-                                                            f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct3,
-                                                               f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct4,
-                                                               f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct5,
-                                                               f->double_ulps))
-                                {
-                                    fail = fail && (test != 0.0f);
-                                    if (!fail) err = 0.0f;
-                                }
-
-                                if (IsDoubleSubnormal(s3[j]))
-                                { // but you have to know how!
-                                    correct2 = f->dfunc.f_fff(0.0, 0.0, 0.0f);
-                                    correct3 = f->dfunc.f_fff(-0.0, 0.0, 0.0f);
-                                    correct4 = f->dfunc.f_fff(0.0, -0.0, 0.0f);
-                                    correct5 = f->dfunc.f_fff(-0.0, -0.0, 0.0f);
-                                    long double correct6 =
-                                        f->dfunc.f_fff(0.0, 0.0, -0.0f);
-                                    long double correct7 =
-                                        f->dfunc.f_fff(-0.0, 0.0, -0.0f);
-                                    long double correct8 =
-                                        f->dfunc.f_fff(0.0, -0.0, -0.0f);
-                                    long double correct9 =
-                                        f->dfunc.f_fff(-0.0, -0.0, -0.0f);
-                                    err2 = Bruteforce_Ulp_Error_Double(
-                                        test, correct2);
-                                    err3 = Bruteforce_Ulp_Error_Double(
-                                        test, correct3);
-                                    err4 = Bruteforce_Ulp_Error_Double(
-                                        test, correct4);
-                                    err5 = Bruteforce_Ulp_Error_Double(
-                                        test, correct5);
-                                    float err6 = Bruteforce_Ulp_Error_Double(
-                                        test, correct6);
-                                    float err7 = Bruteforce_Ulp_Error_Double(
-                                        test, correct7);
-                                    float err8 = Bruteforce_Ulp_Error_Double(
-                                        test, correct8);
-                                    float err9 = Bruteforce_Ulp_Error_Double(
-                                        test, correct9);
-                                    fail = fail
-                                        && ((!(fabsf(err2) <= f->double_ulps))
-                                            && (!(fabsf(err3)
-                                                  <= f->double_ulps))
-                                            && (!(fabsf(err4)
-                                                  <= f->double_ulps))
-                                            && (!(fabsf(err5)
-                                                  <= f->double_ulps))
-                                            && (!(fabsf(err5)
-                                                  <= f->double_ulps))
-                                            && (!(fabsf(err6)
-                                                  <= f->double_ulps))
-                                            && (!(fabsf(err7)
-                                                  <= f->double_ulps))
-                                            && (!(fabsf(err8)
-                                                  <= f->double_ulps)));
-                                    if (fabsf(err2) < fabsf(err)) err = err2;
-                                    if (fabsf(err3) < fabsf(err)) err = err3;
-                                    if (fabsf(err4) < fabsf(err)) err = err4;
-                                    if (fabsf(err5) < fabsf(err)) err = err5;
-                                    if (fabsf(err6) < fabsf(err)) err = err6;
-                                    if (fabsf(err7) < fabsf(err)) err = err7;
-                                    if (fabsf(err8) < fabsf(err)) err = err8;
-                                    if (fabsf(err9) < fabsf(err)) err = err9;
-
-                                    // retry per section 6.5.3.4
-                                    if (IsDoubleResultSubnormal(correct2,
-                                                                f->double_ulps)
-                                        || IsDoubleResultSubnormal(
-                                            correct3, f->double_ulps)
-                                        || IsDoubleResultSubnormal(
-                                            correct4, f->double_ulps)
-                                        || IsDoubleResultSubnormal(
-                                            correct5, f->double_ulps)
-                                        || IsDoubleResultSubnormal(
-                                            correct6, f->double_ulps)
-                                        || IsDoubleResultSubnormal(
-                                            correct7, f->double_ulps)
-                                        || IsDoubleResultSubnormal(
-                                            correct8, f->double_ulps)
-                                        || IsDoubleResultSubnormal(
-                                            correct9, f->double_ulps))
-                                    {
-                                        fail = fail && (test != 0.0f);
-                                        if (!fail) err = 0.0f;
-                                    }
-                                }
-                            }
-                            else if (IsDoubleSubnormal(s3[j]))
-                            {
-                                correct2 = f->dfunc.f_fff(0.0, s2[j], 0.0);
-                                correct3 = f->dfunc.f_fff(-0.0, s2[j], 0.0);
-                                long double correct4 =
-                                    f->dfunc.f_fff(0.0, s2[j], -0.0);
-                                long double correct5 =
-                                    f->dfunc.f_fff(-0.0, s2[j], -0.0);
-                                err2 =
-                                    Bruteforce_Ulp_Error_Double(test, correct2);
-                                err3 =
-                                    Bruteforce_Ulp_Error_Double(test, correct3);
-                                float err4 =
-                                    Bruteforce_Ulp_Error_Double(test, correct4);
-                                float err5 =
-                                    Bruteforce_Ulp_Error_Double(test, correct5);
-                                fail = fail
-                                    && ((!(fabsf(err2) <= f->double_ulps))
-                                        && (!(fabsf(err3) <= f->double_ulps))
-                                        && (!(fabsf(err4) <= f->double_ulps))
-                                        && (!(fabsf(err5) <= f->double_ulps)));
-                                if (fabsf(err2) < fabsf(err)) err = err2;
-                                if (fabsf(err3) < fabsf(err)) err = err3;
-                                if (fabsf(err4) < fabsf(err)) err = err4;
-                                if (fabsf(err5) < fabsf(err)) err = err5;
-
-                                // retry per section 6.5.3.4
-                                if (IsDoubleResultSubnormal(correct2,
-                                                            f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct3,
-                                                               f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct4,
-                                                               f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct5,
-                                                               f->double_ulps))
-                                {
-                                    fail = fail && (test != 0.0f);
-                                    if (!fail) err = 0.0f;
-                                }
-                            }
-                        }
-                        else if (fail && IsDoubleSubnormal(s2[j]))
-                        {
-                            long double correct2 =
-                                f->dfunc.f_fff(s[j], 0.0, s3[j]);
-                            long double correct3 =
-                                f->dfunc.f_fff(s[j], -0.0, s3[j]);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct2);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            fail = fail
-                                && ((!(fabsf(err2) <= f->double_ulps))
-                                    && (!(fabsf(err3) <= f->double_ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2,
-                                                        f->double_ulps)
-                                || IsDoubleResultSubnormal(correct3,
-                                                           f->double_ulps))
-                            {
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-
-                            // try with second two args as zero
-                            if (IsDoubleSubnormal(s3[j]))
-                            {
-                                correct2 = f->dfunc.f_fff(s[j], 0.0, 0.0);
-                                correct3 = f->dfunc.f_fff(s[j], -0.0, 0.0);
-                                long double correct4 =
-                                    f->dfunc.f_fff(s[j], 0.0, -0.0);
-                                long double correct5 =
-                                    f->dfunc.f_fff(s[j], -0.0, -0.0);
-                                err2 =
-                                    Bruteforce_Ulp_Error_Double(test, correct2);
-                                err3 =
-                                    Bruteforce_Ulp_Error_Double(test, correct3);
-                                float err4 =
-                                    Bruteforce_Ulp_Error_Double(test, correct4);
-                                float err5 =
-                                    Bruteforce_Ulp_Error_Double(test, correct5);
-                                fail = fail
-                                    && ((!(fabsf(err2) <= f->double_ulps))
-                                        && (!(fabsf(err3) <= f->double_ulps))
-                                        && (!(fabsf(err4) <= f->double_ulps))
-                                        && (!(fabsf(err5) <= f->double_ulps)));
-                                if (fabsf(err2) < fabsf(err)) err = err2;
-                                if (fabsf(err3) < fabsf(err)) err = err3;
-                                if (fabsf(err4) < fabsf(err)) err = err4;
-                                if (fabsf(err5) < fabsf(err)) err = err5;
-
-                                // retry per section 6.5.3.4
-                                if (IsDoubleResultSubnormal(correct2,
-                                                            f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct3,
-                                                               f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct4,
-                                                               f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct5,
-                                                               f->double_ulps))
-                                {
-                                    fail = fail && (test != 0.0f);
-                                    if (!fail) err = 0.0f;
-                                }
-                            }
-                        }
-                        else if (fail && IsDoubleSubnormal(s3[j]))
-                        {
-                            long double correct2 =
-                                f->dfunc.f_fff(s[j], s2[j], 0.0);
-                            long double correct3 =
-                                f->dfunc.f_fff(s[j], s2[j], -0.0);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct2);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            fail = fail
-                                && ((!(fabsf(err2) <= f->double_ulps))
-                                    && (!(fabsf(err3) <= f->double_ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2,
-                                                        f->double_ulps)
-                                || IsDoubleResultSubnormal(correct3,
-                                                           f->double_ulps))
-                            {
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-
-                    if (fabsf(err) > maxError)
-                    {
-                        maxError = fabsf(err);
-                        maxErrorVal = s[j];
-                        maxErrorVal2 = s2[j];
-                        maxErrorVal3 = s3[j];
-                    }
-
-                    if (fail)
-                    {
-                        vlog_error("\nERROR: %sD%s: %f ulp error at {%.13la, "
-                                   "%.13la, %.13la}: *%.13la vs. %.13la\n",
-                                   f->name, sizeNames[k], err, s[j], s2[j],
-                                   s3[j], ((double *)gOut_Ref)[j], test);
-                        error = -1;
-                        goto exit;
-                    }
-                }
-            }
-        }
-
-        if (0 == (i & 0x0fffffff))
-        {
-            if (gVerboseBruteForce)
-            {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
-            }
-            else
-            {
-                vlog(".");
-            }
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        double *p2 = (double *)gIn2;
-        double *p3 = (double *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-            p3[j] = DoubleFromUInt32(genrand_int32(d));
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
-                                        &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
-             maxErrorVal3);
-    vlog("\n");
-
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
new file mode 100644
index 0000000000..b97b1943e6
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -0,0 +1,662 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+#if defined(__APPLE__)
+#include <sys/time.h>
+#endif
+// Assembles the unary double kernel source and passes it to MakeKernels().
+static int BuildKernelDouble(const char *name, int vectorSize,
+                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                             bool relaxedMode)
+{
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+    // Width-3 variant: vload3/vstore3 on scalar pointers, NAN-padded tail.
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 f0 = vload3( 0, in + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    // Use the generic source unless this vector-size index has width 3.
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+    // Same "math_kernel<size>" name that the assembled source declares.
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+    // kernel_count, k, p and relaxedMode are forwarded to MakeKernels as-is.
+    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
+                       relaxedMode);
+}
+
+typedef struct BuildKernelInfo // argument block read by BuildKernel_DoubleFn
+{
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count; // forwarded unchanged to BuildKernelDouble
+    cl_kernel **kernels; // kernels[i] is passed on for vector-size index i
+    cl_program *programs; // programs + i is passed on for vector-size index i
+    const char *nameInCode; // function name inserted into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+// Per-job entry point: p is a BuildKernelInfo*; builds index offset + job_id.
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
+    cl_uint i = info->offset + job_id; // vector-size index handled by this job
+    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
+                             info->kernels[i], info->programs + i,
+                             info->relaxedMode);
+}
+
+// Thread-specific data for a worker thread (TestDouble indexes it by thread_id)
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers, one per vector size
+    float maxError; // max error value. Init to 0.
+    double maxErrorValue; // position of the max error value.  Init to 0.
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo // job description handed to TestDouble as `data`
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next (base = job_id * step)
+    cl_uint scale; // stride between individual test values within a chunk
+    float ulps; // maximum allowed error, in ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    int isRangeLimited; // 1 if the function is only to be evaluated over a
+                        // range
+    float half_sin_cos_tan_limit; // NOTE(review): presumably the bound used when isRangeLimited - confirm
+    bool relaxedMode; // True if test is running in relaxed mode, false
+                      // otherwise.
+} TestInfo;
+
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint scale = job->scale;
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
+    cl_uint j, k;
+    cl_int error;
+    int ftz = job->ftz;
+
+    Force64BitFPUPrecision();
+
+    // start the map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Write the new values to the input array
+    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        p[j] = DoubleFromUInt32(base + j * scale);
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            return error;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            return error;
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint32_t pattern = 0xffffdead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            return error;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
+    cl_double *s = (cl_double *)p;
+    for (j = 0; j < buffer_elements; j++) r[j] = (cl_double)func.f_f(s[j]);
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+    // Wait for the last buffer
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        return error;
+    }
+
+
+    // Verify data
+    cl_ulong *t = (cl_ulong *)r;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_ulong *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (t[j] != q[j])
+            {
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_f(s[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail)
+                {
+                    if (ftz)
+                    {
+                        // retry per section 6.5.3.2
+                        if (IsDoubleResultSubnormal(correct, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (IsDoubleSubnormal(s[j]))
+                        {
+                            long double correct2 = func.f_f(0.0L);
+                            long double correct3 = func.f_f(-0.0L);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                }
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                }
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: %f ulp error at %.13la "
+                               "(0x%16.16llx): *%.13la vs. %.13la\n",
+                               job->f->name, sizeNames[k], err,
+                               ((cl_double *)gIn)[j], ((cl_ulong *)gIn)[j],
+                               ((cl_double *)gOut_Ref)[j], test);
+                    return -1;
+                }
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10zd buf_elements:%10u ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, buffer_elements, job->scale, job->ulps,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+    return CL_SUCCESS;
+}
+
+int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+#if defined(__APPLE__)
+    struct timeval time_val;
+    gettimeofday(&time_val, NULL);
+    double start_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
+    double end_time;
+#endif
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+    // Init test_info: per-thread sub-buffer size, test scale and job count
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    }
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // the cl_uint product overflowed; run everything as a single job
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+    test_info.relaxedMode = relaxedMode;
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zu, %zu}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+    }
+
+    // Init the kernels: one build job per vector size in the tested range
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+
+        // Accumulate the per-thread maximum errors before checking `error`
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+            }
+        }
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+
+#if defined(__APPLE__)
+    gettimeofday(&time_val, NULL);
+    end_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
+#endif
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        double *p = (double *)gIn;
+
+        if (strstr(f->name, "exp"))
+            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+                p[j] = (double)genrand_real1(d);
+        else if (strstr(f->name, "log"))
+            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+                p[j] = fabs(DoubleFromUInt32(genrand_int32(d)));
+        else
+            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+                p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // don't leak kernels, sub-buffers and queues
+        }
+
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
+    }
+
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+
+#if defined(__APPLE__)
+    vlog("\t(%2.2f seconds)", end_time - start_time);
+#endif
+    vlog("\n");
+
+exit:
+    // Release everything created above; reached on both success and failure
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary_float.cpp
similarity index 58%
rename from test_conformance/math_brute_force/unary.cpp
rename to test_conformance/math_brute_force/unary_float.cpp
index dc6d56c1c6..4c1bd7ab8a 100644
--- a/test_conformance/math_brute_force/unary.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -103,88 +103,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* in )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global double* in)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 f0 = vload3( 0, in + 3 * i );\n"
-        "       f0 = ",
-        name,
-        "( f0 );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 f0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
 typedef struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
@@ -204,16 +122,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
-                             info->kernels[i], info->programs + i,
-                             info->relaxedMode);
-}
-
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
@@ -915,505 +823,3 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 
     return CL_SUCCESS;
 }
-
-
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
-{
-    const TestInfo *job = (const TestInfo *)data;
-    size_t buffer_elements = job->subBufferSize;
-    size_t buffer_size = buffer_elements * sizeof(cl_double);
-    cl_uint scale = job->scale;
-    cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
-    float ulps = job->ulps;
-    dptr func = job->f->dfunc;
-    cl_uint j, k;
-    cl_int error;
-    int ftz = job->ftz;
-
-    Force64BitFPUPrecision();
-
-    // start the map of the output arrays
-    cl_event e[VECTOR_SIZE_COUNT];
-    cl_ulong *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
-
-    // Write the new values to the input array
-    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
-        p[j] = DoubleFromUInt32(base + j * scale);
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
-                                      buffer_size, p, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        return error;
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            return error;
-        }
-        if ((error = clReleaseEvent(e[j])))
-        {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            return error;
-        }
-
-        // Fill the result buffer with garbage, so that old results don't carry
-        // over
-        uint32_t pattern = 0xffffdead;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            return error;
-        }
-
-        // run the kernel
-        size_t vectorCount =
-            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
-                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];
-
-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-
-        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
-                                            &vectorCount, NULL, 0, NULL, NULL)))
-        {
-            vlog_error("FAILED -- could not execute kernel\n");
-            return error;
-        }
-    }
-
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
-
-    if (gSkipCorrectnessTesting) return CL_SUCCESS;
-
-    // Calculate the correctly rounded reference result
-    cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
-    cl_double *s = (cl_double *)p;
-    for (j = 0; j < buffer_elements; j++) r[j] = (cl_double)func.f_f(s[j]);
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
-            buffer_size, 0, NULL, NULL, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-    // Wait for the last buffer
-    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        return error;
-    }
-
-
-    // Verify data
-    cl_ulong *t = (cl_ulong *)r;
-    for (j = 0; j < buffer_elements; j++)
-    {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        {
-            cl_ulong *q = out[k];
-
-            // If we aren't getting the correctly rounded result
-            if (t[j] != q[j])
-            {
-                cl_double test = ((cl_double *)q)[j];
-                long double correct = func.f_f(s[j]);
-                float err = Bruteforce_Ulp_Error_Double(test, correct);
-                int fail = !(fabsf(err) <= ulps);
-
-                if (fail)
-                {
-                    if (ftz)
-                    {
-                        // retry per section 6.5.3.2
-                        if (IsDoubleResultSubnormal(correct, ulps))
-                        {
-                            fail = fail && (test != 0.0f);
-                            if (!fail) err = 0.0f;
-                        }
-
-                        // retry per section 6.5.3.3
-                        if (IsDoubleSubnormal(s[j]))
-                        {
-                            long double correct2 = func.f_f(0.0L);
-                            long double correct3 = func.f_f(-0.0L);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct2);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            fail = fail
-                                && ((!(fabsf(err2) <= ulps))
-                                    && (!(fabsf(err3) <= ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2, ulps)
-                                || IsDoubleResultSubnormal(correct3, ulps))
-                            {
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-                }
-                if (fabsf(err) > tinfo->maxError)
-                {
-                    tinfo->maxError = fabsf(err);
-                    tinfo->maxErrorValue = s[j];
-                }
-                if (fail)
-                {
-                    vlog_error("\nERROR: %s%s: %f ulp error at %.13la "
-                               "(0x%16.16llx): *%.13la vs. %.13la\n",
-                               job->f->name, sizeNames[k], err,
-                               ((cl_double *)gIn)[j], ((cl_ulong *)gIn)[j],
-                               ((cl_double *)gOut_Ref)[j], test);
-                    return -1;
-                }
-            }
-        }
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
-                       j, error);
-            return error;
-        }
-    }
-
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
-
-
-    if (0 == (base & 0x0fffffff))
-    {
-        if (gVerboseBruteForce)
-        {
-            vlog("base:%14u step:%10u scale:%10zd buf_elements:%10u ulps:%5.3f "
-                 "ThreadCount:%2u\n",
-                 base, job->step, buffer_elements, job->scale, job->ulps,
-                 job->threadCount);
-        }
-        else
-        {
-            vlog(".");
-        }
-        fflush(stdout);
-    }
-
-    return CL_SUCCESS;
-}
-
-int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    size_t i, j;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-#if defined(__APPLE__)
-    struct timeval time_val;
-    gettimeofday(&time_val, NULL);
-    double start_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
-    double end_time;
-#endif
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_double)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->double_ulps;
-    test_info.ftz = f->ftz || gForceFTZ;
-    test_info.relaxedMode = relaxedMode;
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-
-#if defined(__APPLE__)
-    gettimeofday(&time_val, NULL);
-    end_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
-#endif
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-
-        if (strstr(f->name, "exp"))
-            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
-                p[j] = (double)genrand_real1(d);
-        else if (strstr(f->name, "log"))
-            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
-                p[j] = fabs(DoubleFromUInt32(genrand_int32(d)));
-        else
-            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
-                p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
-
-#if defined(__APPLE__)
-    vlog("\t(%2.2f seconds)", end_time - start_time);
-#endif
-    vlog("\n");
-
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
diff --git a/test_conformance/math_brute_force/unary_two_results_double.cpp b/test_conformance/math_brute_force/unary_two_results_double.cpp
new file mode 100644
index 0000000000..779c653ab3
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_two_results_double.cpp
@@ -0,0 +1,523 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+// Assembles the fp64 math_kernel source for one vector size and builds it.
+static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
+                             cl_program *p, bool relaxedMode)
+{
+    const char *genericSource[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double",
+        sizeNames[vectorSize],
+        "* out, __global double",
+        sizeNames[vectorSize],
+        "* out2, __global double",
+        sizeNames[vectorSize],
+        "* in )\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   out[i] = ",
+        name,
+        "( in[i], out2 + i );\n"
+        "}\n"
+    };
+
+    const char *vec3Source[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* out2, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 f0 = vload3( 0, in + 3 * i );\n"
+        "       double3 iout = NAN;\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( iout, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 iout = NAN;\n"
+        "       double3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = iout.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = iout.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    // Only the 3-element vector size needs the vload3/vstore3 variant.
+    const bool isVec3 = (sizeValues[vectorSize] == 3);
+    const char **src = isVec3 ? vec3Source : genericSource;
+    const size_t srcCount = isVec3
+        ? sizeof(vec3Source) / sizeof(vec3Source[0])
+        : sizeof(genericSource) / sizeof(genericSource[0]);
+
+    char kernelName[32];
+    snprintf(kernelName, sizeof(kernelName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+    return MakeKernel(src, (cl_uint)srcCount, kernelName, k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // receives one kernel per vector size index
+    cl_program *programs; // receives one program per vector size index
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{
+    const BuildKernelInfo *job = (const BuildKernelInfo *)p;
+    const cl_uint idx = job->offset + job_id; // vector size index to build
+    return BuildKernelDouble(job->nameInCode, idx, &job->kernels[idx],
+                             &job->programs[idx], job->relaxedMode);
+}
+
+// Brute-force test of a double-precision function that returns two results
+// (return value plus pointer out-parameter), checked against f->dfunc.f_fpf.
+// Returns the final error code: 0 on success, -1 after a reported ULP failure.
+int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError0 = 0.0f;
+    float maxError1 = 0.0f;
+    int ftz = f->ftz || gForceFTZ;
+    double maxErrorVal0 = 0.0f;
+    double maxErrorVal1 = 0.0f;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1);
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    Force64BitFPUPrecision();
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        double *p = (double *)gIn;
+        if (gWimpyMode)
+        {
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
+        }
+        else
+        {
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j);
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release the kernels and programs built above
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+
+            memset_pattern4(gOut2[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        double *r = (double *)gOut_Ref;
+        double *r2 = (double *)gOut_Ref2;
+        double *s = (double *)gIn;
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+        {
+            long double dd;
+            r[j] = (double)f->dfunc.f_fpf(s[j], &dd);
+            r2[j] = (double)dd;
+        }
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint64_t *t = (uint64_t *)gOut_Ref;
+        uint64_t *t2 = (uint64_t *)gOut_Ref2;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint64_t *q = (uint64_t *)(gOut[k]);
+                uint64_t *q2 = (uint64_t *)(gOut2[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j] || t2[j] != q2[j])
+                {
+                    double test = ((double *)q)[j];
+                    double test2 = ((double *)q2)[j];
+                    long double correct2;
+                    long double correct = f->dfunc.f_fpf(s[j], &correct2);
+                    float err = Bruteforce_Ulp_Error_Double(test, correct);
+                    float err2 = Bruteforce_Ulp_Error_Double(test2, correct2);
+                    int fail = !(fabsf(err) <= f->double_ulps
+                                 && fabsf(err2) <= f->double_ulps);
+                    if (ftz)
+                    {
+                        // retry per section 6.5.3.2
+                        if (IsDoubleResultSubnormal(correct, f->double_ulps))
+                        {
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps))
+                            {
+                                fail = fail && !(test == 0.0f && test2 == 0.0f);
+                                if (!fail) err = err2 = 0.0f;
+                            }
+                            else
+                            {
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && fabsf(err2) <= f->double_ulps);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                        else if (IsDoubleResultSubnormal(correct2,
+                                                         f->double_ulps))
+                        {
+                            fail = fail
+                                && !(test2 == 0.0f
+                                     && fabsf(err) <= f->double_ulps);
+                            if (!fail) err2 = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (IsDoubleSubnormal(s[j]))
+                        {
+                            long double correct2p, correct2n;
+                            long double correctp =
+                                f->dfunc.f_fpf(0.0, &correct2p);
+                            long double correctn =
+                                f->dfunc.f_fpf(-0.0, &correct2n);
+                            // NOTE(review): err2p/err2n below compare `test`,
+                            // not `test2`, with the second result -- confirm.
+                            float errp =
+                                Bruteforce_Ulp_Error_Double(test, correctp);
+                            float err2p =
+                                Bruteforce_Ulp_Error_Double(test, correct2p);
+                            float errn =
+                                Bruteforce_Ulp_Error_Double(test, correctn);
+                            float err2n =
+                                Bruteforce_Ulp_Error_Double(test, correct2n);
+                            fail = fail
+                                && ((!(fabsf(errp) <= f->double_ulps))
+                                    && (!(fabsf(err2p) <= f->double_ulps))
+                                    && ((!(fabsf(errn) <= f->double_ulps))
+                                        && (!(fabsf(err2n)
+                                              <= f->double_ulps))));
+                            if (fabsf(errp) < fabsf(err)) err = errp;
+                            if (fabsf(errn) < fabsf(err)) err = errn;
+                            if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
+                            if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correctp,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correctn,
+                                                           f->double_ulps))
+                            {
+                                if (IsDoubleResultSubnormal(correct2p,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct2n,
+                                                               f->double_ulps))
+                                {
+                                    fail = fail
+                                        && !(test == 0.0f && test2 == 0.0f);
+                                    if (!fail) err = err2 = 0.0f;
+                                }
+                                else
+                                {
+                                    fail = fail
+                                        && !(test == 0.0f
+                                             && fabsf(err2) <= f->double_ulps);
+                                    if (!fail) err = 0.0f;
+                                }
+                            }
+                            else if (IsDoubleResultSubnormal(correct2p,
+                                                             f->double_ulps)
+                                     || IsDoubleResultSubnormal(correct2n,
+                                                                f->double_ulps))
+                            {
+                                fail = fail
+                                    && !(test2 == 0.0f
+                                         && (fabsf(err) <= f->double_ulps));
+                                if (!fail) err2 = 0.0f;
+                            }
+                        }
+                    }
+                    if (fabsf(err) > maxError0)
+                    {
+                        maxError0 = fabsf(err);
+                        maxErrorVal0 = s[j];
+                    }
+                    if (fabsf(err2) > maxError1)
+                    {
+                        maxError1 = fabsf(err2);
+                        maxErrorVal1 = s[j];
+                    }
+                    if (fail)
+                    {
+                        vlog_error(
+                            "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: "
+                            "*{%.13la, %.13la} vs. {%.13la, %.13la}\n",
+                            f->name, sizeNames[k], err, err2,
+                            ((double *)gIn)[j], ((double *)gOut_Ref)[j],
+                            ((double *)gOut_Ref2)[j], test, test2);
+                        error = -1;
+                        goto exit;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14llu step:%10llu  bufferSize:%10zu \n",
+                     (unsigned long long)i, (unsigned long long)step, bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        double *p = (double *)gIn;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release the kernels and programs built above
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
+             maxErrorVal1);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results_float.cpp
similarity index 56%
rename from test_conformance/math_brute_force/unary_two_results.cpp
rename to test_conformance/math_brute_force/unary_two_results_float.cpp
index accebd3a4e..cda80b47d3 100644
--- a/test_conformance/math_brute_force/unary_two_results.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_float.cpp
@@ -105,93 +105,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* out2, __global double",
-                        sizeNames[vectorSize],
-                        "* in )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i], out2 + i );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global double* out2, __global double* in)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 f0 = vload3( 0, in + 3 * i );\n"
-        "       double3 iout = NAN;\n"
-        "       f0 = ",
-        name,
-        "( f0, &iout );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "       vstore3( iout, 0, out2 + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 iout = NAN;\n"
-        "       double3 f0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0, &iout );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               out2[3*i+1] = iout.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               out2[3*i] = iout.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
 typedef struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
@@ -210,15 +123,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
-}
-
 int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     uint64_t i;
@@ -752,400 +656,3 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
 
     return error;
 }
-
-int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    float maxError0 = 0.0f;
-    float maxError1 = 0.0f;
-    int ftz = f->ftz || gForceFTZ;
-    double maxErrorVal0 = 0.0f;
-    double maxErrorVal1 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1);
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    Force64BitFPUPrecision();
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        if (gWimpyMode)
-        {
-            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
-                p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
-        }
-        else
-        {
-            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
-                p[j] = DoubleFromUInt32((uint32_t)i + j);
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-
-            memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
-                                          bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        double *r = (double *)gOut_Ref;
-        double *r2 = (double *)gOut_Ref2;
-        double *s = (double *)gIn;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
-        {
-            long double dd;
-            r[j] = (double)f->dfunc.f_fpf(s[j], &dd);
-            r2[j] = (double)dd;
-        }
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray2 failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data
-        uint64_t *t = (uint64_t *)gOut_Ref;
-        uint64_t *t2 = (uint64_t *)gOut_Ref2;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-        {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-            {
-                uint64_t *q = (uint64_t *)(gOut[k]);
-                uint64_t *q2 = (uint64_t *)(gOut2[k]);
-
-                // If we aren't getting the correctly rounded result
-                if (t[j] != q[j] || t2[j] != q2[j])
-                {
-                    double test = ((double *)q)[j];
-                    double test2 = ((double *)q2)[j];
-                    long double correct2;
-                    long double correct = f->dfunc.f_fpf(s[j], &correct2);
-                    float err = Bruteforce_Ulp_Error_Double(test, correct);
-                    float err2 = Bruteforce_Ulp_Error_Double(test2, correct2);
-                    int fail = !(fabsf(err) <= f->double_ulps
-                                 && fabsf(err2) <= f->double_ulps);
-                    if (ftz)
-                    {
-                        // retry per section 6.5.3.2
-                        if (IsDoubleResultSubnormal(correct, f->double_ulps))
-                        {
-                            if (IsDoubleResultSubnormal(correct2,
-                                                        f->double_ulps))
-                            {
-                                fail = fail && !(test == 0.0f && test2 == 0.0f);
-                                if (!fail)
-                                {
-                                    err = 0.0f;
-                                    err2 = 0.0f;
-                                }
-                            }
-                            else
-                            {
-                                fail = fail
-                                    && !(test == 0.0f
-                                         && fabsf(err2) <= f->double_ulps);
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                        else if (IsDoubleResultSubnormal(correct2,
-                                                         f->double_ulps))
-                        {
-                            fail = fail
-                                && !(test2 == 0.0f
-                                     && fabsf(err) <= f->double_ulps);
-                            if (!fail) err2 = 0.0f;
-                        }
-
-                        // retry per section 6.5.3.3
-                        if (IsDoubleSubnormal(s[j]))
-                        {
-                            long double correct2p, correct2n;
-                            long double correctp =
-                                f->dfunc.f_fpf(0.0, &correct2p);
-                            long double correctn =
-                                f->dfunc.f_fpf(-0.0, &correct2n);
-                            float errp =
-                                Bruteforce_Ulp_Error_Double(test, correctp);
-                            float err2p =
-                                Bruteforce_Ulp_Error_Double(test, correct2p);
-                            float errn =
-                                Bruteforce_Ulp_Error_Double(test, correctn);
-                            float err2n =
-                                Bruteforce_Ulp_Error_Double(test, correct2n);
-                            fail = fail
-                                && ((!(fabsf(errp) <= f->double_ulps))
-                                    && (!(fabsf(err2p) <= f->double_ulps))
-                                    && ((!(fabsf(errn) <= f->double_ulps))
-                                        && (!(fabsf(err2n)
-                                              <= f->double_ulps))));
-                            if (fabsf(errp) < fabsf(err)) err = errp;
-                            if (fabsf(errn) < fabsf(err)) err = errn;
-                            if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
-                            if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correctp,
-                                                        f->double_ulps)
-                                || IsDoubleResultSubnormal(correctn,
-                                                           f->double_ulps))
-                            {
-                                if (IsDoubleResultSubnormal(correct2p,
-                                                            f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct2n,
-                                                               f->double_ulps))
-                                {
-                                    fail = fail
-                                        && !(test == 0.0f && test2 == 0.0f);
-                                    if (!fail) err = err2 = 0.0f;
-                                }
-                                else
-                                {
-                                    fail = fail
-                                        && !(test == 0.0f
-                                             && fabsf(err2) <= f->double_ulps);
-                                    if (!fail) err = 0.0f;
-                                }
-                            }
-                            else if (IsDoubleResultSubnormal(correct2p,
-                                                             f->double_ulps)
-                                     || IsDoubleResultSubnormal(correct2n,
-                                                                f->double_ulps))
-                            {
-                                fail = fail
-                                    && !(test2 == 0.0f
-                                         && (fabsf(err) <= f->double_ulps));
-                                if (!fail) err2 = 0.0f;
-                            }
-                        }
-                    }
-                    if (fabsf(err) > maxError0)
-                    {
-                        maxError0 = fabsf(err);
-                        maxErrorVal0 = s[j];
-                    }
-                    if (fabsf(err2) > maxError1)
-                    {
-                        maxError1 = fabsf(err2);
-                        maxErrorVal1 = s[j];
-                    }
-                    if (fail)
-                    {
-                        vlog_error(
-                            "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: "
-                            "*{%.13la, %.13la} vs. {%.13la, %.13la}\n",
-                            f->name, sizeNames[k], err, err2,
-                            ((double *)gIn)[j], ((double *)gOut_Ref)[j],
-                            ((double *)gOut_Ref2)[j], test, test2);
-                        error = -1;
-                        goto exit;
-                    }
-                }
-            }
-        }
-
-        if (0 == (i & 0x0fffffff))
-        {
-            if (gVerboseBruteForce)
-            {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
-            }
-            else
-            {
-                vlog(".");
-            }
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
-             maxErrorVal1);
-    vlog("\n");
-
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}
diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
similarity index 52%
rename from test_conformance/math_brute_force/unary_two_results_i.cpp
rename to test_conformance/math_brute_force/unary_two_results_i_double.cpp
index 2ac083d2f2..3fd616a4aa 100644
--- a/test_conformance/math_brute_force/unary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
@@ -21,91 +21,6 @@
 #include <limits.h>
 #include <string.h>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global float",
-                        sizeNames[vectorSize],
-                        "* out, __global int",
-                        sizeNames[vectorSize],
-                        "* out2, __global float",
-                        sizeNames[vectorSize],
-                        "* in )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i], out2 + i );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global float* out, __global int* out2, __global float* in)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       float3 f0 = vload3( 0, in + 3 * i );\n"
-        "       int3 iout = INT_MIN;\n"
-        "       f0 = ",
-        name,
-        "( f0, &iout );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "       vstore3( iout, 0, out2 + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       int3 iout = INT_MIN;\n"
-        "       float3 f0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0, &iout );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               out2[3*i+1] = iout.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               out2[3*i] = iout.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
@@ -202,15 +117,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
-}
-
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
@@ -226,367 +132,6 @@ static cl_ulong abs_cl_long(cl_long i)
     return (i ^ mask) - mask;
 }
 
-int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    float maxError = 0.0f;
-    int64_t maxError2 = 0;
-    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    float maxErrorVal = 0.0f;
-    float maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
-    cl_ulong maxiError;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    float float_ulps;
-    if (gIsEmbedded)
-        float_ulps = f->float_embedded_ulps;
-    else
-        float_ulps = f->float_ulps;
-
-    maxiError = float_ulps == INFINITY ? CL_ULONG_MAX : 0;
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        if (gWimpyMode)
-        {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
-                p[j] = (uint32_t)i + j * scale;
-        }
-        else
-        {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
-                p[j] = (uint32_t)i + j;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-
-            memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
-                                          bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        float *r = (float *)gOut_Ref;
-        int *r2 = (int *)gOut_Ref2;
-        float *s = (float *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-            r[j] = (float)f->func.f_fpI(s[j], r2 + j);
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray2 failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data
-        uint32_t *t = (uint32_t *)gOut_Ref;
-        int32_t *t2 = (int32_t *)gOut_Ref2;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-            {
-                uint32_t *q = (uint32_t *)(gOut[k]);
-                int32_t *q2 = (int32_t *)(gOut2[k]);
-
-                // If we aren't getting the correctly rounded result
-                if (t[j] != q[j] || t2[j] != q2[j])
-                {
-                    float test = ((float *)q)[j];
-                    int correct2 = INT_MIN;
-                    double correct = f->func.f_fpI(s[j], &correct2);
-                    float err = Ulp_Error(test, correct);
-                    cl_long iErr = (int64_t)q2[j] - (int64_t)correct2;
-                    int fail = !(fabsf(err) <= float_ulps
-                                 && abs_cl_long(iErr) <= maxiError);
-                    if (ftz)
-                    {
-                        // retry per section 6.5.3.2
-                        if (IsFloatResultSubnormal(correct, float_ulps))
-                        {
-                            fail = fail && !(test == 0.0f && iErr == 0);
-                            if (!fail) err = 0.0f;
-                        }
-
-                        // retry per section 6.5.3.3
-                        if (IsFloatSubnormal(s[j]))
-                        {
-                            int correct5, correct6;
-                            double correct3 = f->func.f_fpI(0.0, &correct5);
-                            double correct4 = f->func.f_fpI(-0.0, &correct6);
-                            float err2 = Ulp_Error(test, correct3);
-                            float err3 = Ulp_Error(test, correct4);
-                            cl_long iErr2 =
-                                (long long)q2[j] - (long long)correct5;
-                            cl_long iErr3 =
-                                (long long)q2[j] - (long long)correct6;
-
-                            // Did +0 work?
-                            if (fabsf(err2) <= float_ulps
-                                && abs_cl_long(iErr2) <= maxiError)
-                            {
-                                err = err2;
-                                iErr = iErr2;
-                                fail = 0;
-                            }
-                            // Did -0 work?
-                            else if (fabsf(err3) <= float_ulps
-                                     && abs_cl_long(iErr3) <= maxiError)
-                            {
-                                err = err3;
-                                iErr = iErr3;
-                                fail = 0;
-                            }
-
-                            // retry per section 6.5.3.4
-                            if (fail
-                                && (IsFloatResultSubnormal(correct2, float_ulps)
-                                    || IsFloatResultSubnormal(correct3,
-                                                              float_ulps)))
-                            {
-                                fail = fail
-                                    && !(test == 0.0f
-                                         && (abs_cl_long(iErr2) <= maxiError
-                                             || abs_cl_long(iErr3)
-                                                 <= maxiError));
-                                if (!fail)
-                                {
-                                    err = 0.0f;
-                                    iErr = 0;
-                                }
-                            }
-                        }
-                    }
-                    if (fabsf(err) > maxError)
-                    {
-                        maxError = fabsf(err);
-                        maxErrorVal = s[j];
-                    }
-                    if (llabs(iErr) > maxError2)
-                    {
-                        maxError2 = llabs(iErr);
-                        maxErrorVal2 = s[j];
-                    }
-
-                    if (fail)
-                    {
-                        vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: "
-                                   "*{%a, %d} vs. {%a, %d}\n",
-                                   f->name, sizeNames[k], err, (int)iErr,
-                                   ((float *)gIn)[j], ((float *)gOut_Ref)[j],
-                                   ((int *)gOut_Ref2)[j], test, q2[j]);
-                        error = -1;
-                        goto exit;
-                    }
-                }
-            }
-        }
-
-        if (0 == (i & 0x0fffffff))
-        {
-            if (gVerboseBruteForce)
-            {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
-            }
-            else
-            {
-                vlog(".");
-            }
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-            p[j] = genrand_int32(d);
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
-    vlog("\n");
-
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}
-
 int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     uint64_t i;
diff --git a/test_conformance/math_brute_force/unary_two_results_i_float.cpp b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
new file mode 100644
index 0000000000..82bbb81b3e
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
@@ -0,0 +1,492 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <limits.h>
+#include <string.h>
+
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
+{ // Assembles the source of math_kernel<size> around a call to `name` and hands it to MakeKernel.
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global int",
+                        sizeNames[vectorSize],
+                        "* out2, __global float",
+                        sizeNames[vectorSize],
+                        "* in )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i], out2 + i );\n"
+                        "}\n" }; // generic source: work-item i handles element i
+
+    const char *c3[] = { // 3-wide source: vload3/vstore3 over scalar pointers, with a tail branch
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global int* out2, __global float* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 iout = INT_MIN;\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( iout, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       int3 iout = INT_MIN;\n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = iout.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = iout.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    const char **kern = c; // default to the generic source
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3) // width-3 vectors use the c3 source instead
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+
+    char testName[32]; // kernel entry point: "math_kernel" + size suffix
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // output array; BuildKernel_FloatFn writes slot offset + job_id
+    cl_program *programs; // output array, indexed the same way as kernels
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
+{ // ThreadPool_Do job: builds the kernel for vector-size index offset + job_id.
+    BuildKernelInfo *build = (BuildKernelInfo *)p;
+    const cl_uint sizeIndex = build->offset + job_id;
+    return BuildKernel(build->nameInCode, sizeIndex, &build->kernels[sizeIndex],
+                       &build->programs[sizeIndex], build->relaxedMode);
+}
+
+static cl_ulong abs_cl_long(cl_long i)
+{ // Returns |i| as an unsigned value; well-defined for every input, including CL_LONG_MIN.
+    cl_ulong u = (cl_ulong)i; // work in unsigned: no signed shift, no signed overflow
+    return i < 0 ? (cl_ulong)0 - u : u; // modular negate yields the magnitude
+}
+
+int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
+{ // Checks the device's float result and int out-value for f against the host reference f->func.f_fpI.
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError = 0.0f;
+    int64_t maxError2 = 0;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    float maxErrorVal = 0.0f;
+    float maxErrorVal2 = 0.0f;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
+    cl_ulong maxiError;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    float float_ulps;
+    if (gIsEmbedded)
+        float_ulps = f->float_embedded_ulps;
+    else
+        float_ulps = f->float_ulps;
+
+    maxiError = float_ulps == INFINITY ? CL_ULONG_MAX : 0; // int result must match exactly unless ulps are unbounded
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        uint32_t *p = (uint32_t *)gIn;
+        if (gWimpyMode)
+        {
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j * scale;
+        }
+        else
+        {
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j;
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // kernels are already built: release them instead of leaking
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+
+            memset_pattern4(gOut2[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // work-items needed, rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        float *r = (float *)gOut_Ref;
+        int *r2 = (int *)gOut_Ref2;
+        float *s = (float *)gIn;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            r[j] = (float)f->func.f_fpI(s[j], r2 + j);
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint32_t *t = (uint32_t *)gOut_Ref;
+        int32_t *t2 = (int32_t *)gOut_Ref2;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint32_t *q = (uint32_t *)(gOut[k]);
+                int32_t *q2 = (int32_t *)(gOut2[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j] || t2[j] != q2[j])
+                {
+                    float test = ((float *)q)[j];
+                    int correct2 = INT_MIN;
+                    double correct = f->func.f_fpI(s[j], &correct2);
+                    float err = Ulp_Error(test, correct);
+                    cl_long iErr = (int64_t)q2[j] - (int64_t)correct2;
+                    int fail = !(fabsf(err) <= float_ulps
+                                 && abs_cl_long(iErr) <= maxiError);
+                    if (ftz)
+                    {
+                        // retry per section 6.5.3.2
+                        if (IsFloatResultSubnormal(correct, float_ulps))
+                        {
+                            fail = fail && !(test == 0.0f && iErr == 0);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (IsFloatSubnormal(s[j]))
+                        {
+                            int correct5, correct6;
+                            double correct3 = f->func.f_fpI(0.0, &correct5);
+                            double correct4 = f->func.f_fpI(-0.0, &correct6);
+                            float err2 = Ulp_Error(test, correct3);
+                            float err3 = Ulp_Error(test, correct4);
+                            cl_long iErr2 =
+                                (long long)q2[j] - (long long)correct5;
+                            cl_long iErr3 =
+                                (long long)q2[j] - (long long)correct6;
+
+                            // Did +0 work?
+                            if (fabsf(err2) <= float_ulps
+                                && abs_cl_long(iErr2) <= maxiError)
+                            {
+                                err = err2;
+                                iErr = iErr2;
+                                fail = 0;
+                            }
+                            // Did -0 work?
+                            else if (fabsf(err3) <= float_ulps
+                                     && abs_cl_long(iErr3) <= maxiError)
+                            {
+                                err = err3;
+                                iErr = iErr3;
+                                fail = 0;
+                            }
+
+                            // retry per section 6.5.3.4: test the float references for +0/-0, not the int out-value
+                            if (fail
+                                && (IsFloatResultSubnormal(correct3, float_ulps)
+                                    || IsFloatResultSubnormal(correct4,
+                                                              float_ulps)))
+                            {
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (abs_cl_long(iErr2) <= maxiError
+                                             || abs_cl_long(iErr3)
+                                                 <= maxiError));
+                                if (!fail)
+                                {
+                                    err = 0.0f;
+                                    iErr = 0;
+                                }
+                            }
+                        }
+                    }
+                    if (fabsf(err) > maxError)
+                    {
+                        maxError = fabsf(err);
+                        maxErrorVal = s[j];
+                    }
+                    if (llabs(iErr) > maxError2)
+                    {
+                        maxError2 = llabs(iErr);
+                        maxErrorVal2 = s[j];
+                    }
+
+                    if (fail)
+                    {
+                        vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: "
+                                   "*{%a, %d} vs. {%a, %d}\n",
+                                   f->name, sizeNames[k], err, (int)iErr,
+                                   ((float *)gIn)[j], ((float *)gOut_Ref)[j],
+                                   ((int *)gOut_Ref2)[j], test, q2[j]);
+                        error = -1;
+                        goto exit;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14llu step:%10llu  bufferSize:%10zu \n",
+                     (unsigned long long)i, (unsigned long long)step, bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        uint32_t *p = (uint32_t *)gIn;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            p[j] = genrand_int32(d);
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release the built kernels/programs on this path too
+        }
+
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error; // 0 on success, the failing call's error code, or -1 on a verification failure
+}
diff --git a/test_conformance/math_brute_force/unary_u_double.cpp b/test_conformance/math_brute_force/unary_u_double.cpp
new file mode 100644
index 0000000000..d3b9218695
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_u_double.cpp
@@ -0,0 +1,385 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
+                             cl_program *p, bool relaxedMode)
+{ // Assembles the source of math_kernel<size> (ulong in, double out) around `name` and hands it to MakeKernel.
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global ulong",
+                        sizeNames[vectorSize],
+                        "* in )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" }; // generic source: work-item i handles element i
+
+    const char *c3[] = { // 3-wide source: vload3/vstore3 over scalar pointers, with a tail branch
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global ulong* in                 )\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       ulong3 u0 = vload3( 0, in + 3 * i );\n"
+        "       double3 f0 = ",
+        name,
+        "( u0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       ulong3 u0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, "
+        "0xdeaddeaddeaddeadUL ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               u0 = (ulong3)( in[3*i], in[3*i+1], "
+        "0xdeaddeaddeaddeadUL ); \n"
+        "               break;\n"
+        "       }\n"
+        "       double3 f0 = ",
+        name,
+        "( u0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    const char **kern = c; // default to the generic source
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3) // width-3 vectors use the c3 source instead
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+
+    char testName[32]; // kernel entry point: "math_kernel" + size suffix
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // output array; BuildKernel_DoubleFn writes slot offset + job_id
+    cl_program *programs; // output array, indexed the same way as kernels
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{ // ThreadPool_Do job: builds the kernel for vector-size index offset + job_id.
+    BuildKernelInfo *build = (BuildKernelInfo *)p;
+    const cl_uint sizeIndex = build->offset + job_id;
+    return BuildKernelDouble(build->nameInCode, sizeIndex, &build->kernels[sizeIndex],
+                             &build->programs[sizeIndex], build->relaxedMode);
+}
+
+static cl_ulong random64(MTdata d)
+{ // Combines two genrand_int32 draws into one 64-bit value.
+    return (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32); // NOTE(review): operand evaluation order of | is unspecified, so which draw becomes the low word may differ between compilers
+}
+
+int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
+{ // Checks the device's double result of f on random ulong inputs against the host reference f->dfunc.f_u.
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError = 0.0f;
+    int ftz = f->ftz || gForceFTZ;
+    double maxErrorVal = 0.0f;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    Force64BitFPUPrecision();
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_ulong *p = (cl_ulong *)gIn;
+        for (j = 0; j < bufferSize / sizeof(cl_ulong); j++) p[j] = random64(d);
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // kernels are already built: release them instead of leaking
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // work-items needed, rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        double *r = (double *)gOut_Ref;
+        cl_ulong *s = (cl_ulong *)gIn;
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            r[j] = (double)f->dfunc.f_u(s[j]);
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint64_t *t = (uint64_t *)gOut_Ref;
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint64_t *q = (uint64_t *)(gOut[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j])
+                {
+                    double test = ((double *)q)[j];
+                    long double correct = f->dfunc.f_u(s[j]);
+                    float err = Bruteforce_Ulp_Error_Double(test, correct);
+                    int fail = !(fabsf(err) <= f->double_ulps);
+
+                    // only attempt the flush-to-zero retry when the ulp check failed
+                    if (fail)
+                    {
+                        if (ftz)
+                        {
+                            // retry per section 6.5.3.2
+                            if (IsDoubleResultSubnormal(correct,
+                                                        f->double_ulps))
+                            {
+                                fail = fail && (test != 0.0);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    if (fabsf(err) > maxError)
+                    {
+                        maxError = fabsf(err);
+                        maxErrorVal = s[j];
+                    }
+                    if (fail)
+                    {
+                        vlog_error("\n%s%sD: %f ulp error at 0x%16.16llx: "
+                                   "*%.13la vs. %.13la\n",
+                                   f->name, sizeNames[k], err,
+                                   ((uint64_t *)gIn)[j],
+                                   ((double *)gOut_Ref)[j], test);
+                        error = -1;
+                        goto exit;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14llu step:%10llu  bufferSize:%10zu \n",
+                     (unsigned long long)i, (unsigned long long)step, bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array (raw 64-bit patterns: the kernel reads ulong, not double)
+        cl_ulong *p = (cl_ulong *)gIn;
+
+        for (j = 0; j < bufferSize / sizeof(cl_ulong); j++) p[j] = random64(d);
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release the built kernels/programs on this path too
+        }
+
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
+    }
+
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error; // 0 on success, the failing call's error code, or -1 on a verification failure
+}
diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u_float.cpp
similarity index 54%
rename from test_conformance/math_brute_force/unary_u.cpp
rename to test_conformance/math_brute_force/unary_u_float.cpp
index 3b8f1f6908..74b3b76048 100644
--- a/test_conformance/math_brute_force/unary_u.cpp
+++ b/test_conformance/math_brute_force/unary_u_float.cpp
@@ -99,88 +99,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global ulong",
-                        sizeNames[vectorSize],
-                        "* in )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global ulong* in                 )\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       ulong3 u0 = vload3( 0, in + 3 * i );\n"
-        "       double3 f0 = ",
-        name,
-        "( u0 );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       ulong3 u0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, "
-        "0xdeaddeaddeaddeadUL ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               u0 = (ulong3)( in[3*i], in[3*i+1], "
-        "0xdeaddeaddeaddeadUL ); \n"
-        "               break;\n"
-        "       }\n"
-        "       double3 f0 = ",
-        name,
-        "( u0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
 typedef struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
@@ -199,15 +117,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
-}
-
 int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
 {
     uint64_t i;
@@ -514,267 +423,3 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
 
     return error;
 }
-
-static cl_ulong random64(MTdata d)
-{
-    return (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32);
-}
-
-int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    float maxError = 0.0f;
-    int ftz = f->ftz || gForceFTZ;
-    double maxErrorVal = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    Force64BitFPUPrecision();
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        cl_ulong *p = (cl_ulong *)gIn;
-        for (j = 0; j < bufferSize / sizeof(cl_ulong); j++) p[j] = random64(d);
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        double *r = (double *)gOut_Ref;
-        cl_ulong *s = (cl_ulong *)gIn;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
-            r[j] = (double)f->dfunc.f_u(s[j]);
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data
-        uint64_t *t = (uint64_t *)gOut_Ref;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
-        {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-            {
-                uint64_t *q = (uint64_t *)(gOut[k]);
-
-                // If we aren't getting the correctly rounded result
-                if (t[j] != q[j])
-                {
-                    double test = ((double *)q)[j];
-                    long double correct = f->dfunc.f_u(s[j]);
-                    float err = Bruteforce_Ulp_Error_Double(test, correct);
-                    int fail = !(fabsf(err) <= f->double_ulps);
-
-                    // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
-                    if (fail)
-                    {
-                        if (ftz)
-                        {
-                            // retry per section 6.5.3.2
-                            if (IsDoubleResultSubnormal(correct,
-                                                        f->double_ulps))
-                            {
-                                fail = fail && (test != 0.0);
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-                    if (fabsf(err) > maxError)
-                    {
-                        maxError = fabsf(err);
-                        maxErrorVal = s[j];
-                    }
-                    if (fail)
-                    {
-                        vlog_error("\n%s%sD: %f ulp error at 0x%16.16llx: "
-                                   "*%.13la vs. %.13la\n",
-                                   f->name, sizeNames[k], err,
-                                   ((uint64_t *)gIn)[j],
-                                   ((double *)gOut_Ref)[j], test);
-                        error = -1;
-                        goto exit;
-                    }
-                }
-            }
-        }
-
-        if (0 == (i & 0x0fffffff))
-        {
-            if (gVerboseBruteForce)
-            {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
-            }
-            else
-            {
-                vlog(".");
-            }
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-
-        for (j = 0; j < bufferSize / sizeof(double); j++) p[j] = random64(d);
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
-    vlog("\n");
-
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}

From e2fb655e4cdb048a7d0c3dae33126b08630c281f Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Fri, 5 Mar 2021 14:24:03 +0000
Subject: [PATCH 052/158] Use C++ headers instead of C headers (#1179)

Remove some unnecessary includes.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/binary_double.cpp        | 2 +-
 test_conformance/math_brute_force/binary_float.cpp         | 2 +-
 test_conformance/math_brute_force/binary_i_double.cpp      | 4 ++--
 test_conformance/math_brute_force/binary_i_float.cpp       | 4 ++--
 .../math_brute_force/binary_operator_double.cpp            | 2 +-
 .../math_brute_force/binary_operator_float.cpp             | 2 +-
 .../math_brute_force/binary_two_results_i_double.cpp       | 4 ++--
 .../math_brute_force/binary_two_results_i_float.cpp        | 4 ++--
 test_conformance/math_brute_force/i_unary_double.cpp       | 2 +-
 test_conformance/math_brute_force/i_unary_float.cpp        | 2 +-
 test_conformance/math_brute_force/macro_binary_double.cpp  | 2 +-
 test_conformance/math_brute_force/macro_binary_float.cpp   | 2 +-
 test_conformance/math_brute_force/macro_unary_double.cpp   | 2 +-
 test_conformance/math_brute_force/macro_unary_float.cpp    | 2 +-
 test_conformance/math_brute_force/mad_double.cpp           | 2 +-
 test_conformance/math_brute_force/mad_float.cpp            | 2 +-
 test_conformance/math_brute_force/main.cpp                 | 2 +-
 test_conformance/math_brute_force/reference_math.cpp       | 7 ++++---
 test_conformance/math_brute_force/ternary_double.cpp       | 2 +-
 test_conformance/math_brute_force/ternary_float.cpp        | 2 +-
 test_conformance/math_brute_force/unary_double.cpp         | 2 +-
 test_conformance/math_brute_force/unary_float.cpp          | 2 +-
 .../math_brute_force/unary_two_results_double.cpp          | 2 +-
 .../math_brute_force/unary_two_results_float.cpp           | 2 +-
 .../math_brute_force/unary_two_results_i_double.cpp        | 4 ++--
 .../math_brute_force/unary_two_results_i_float.cpp         | 4 ++--
 test_conformance/math_brute_force/unary_u_double.cpp       | 2 +-
 test_conformance/math_brute_force/unary_u_float.cpp        | 2 +-
 test_conformance/math_brute_force/utility.h                | 7 -------
 29 files changed, 37 insertions(+), 43 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
index 7bff9acad5..a4dbd98e54 100644
--- a/test_conformance/math_brute_force/binary_double.cpp
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
 
diff --git a/test_conformance/math_brute_force/binary_float.cpp b/test_conformance/math_brute_force/binary_float.cpp
index 0ad7b87af2..16012961c0 100644
--- a/test_conformance/math_brute_force/binary_float.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
 
diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index 4d6cb86097..bbc5905ebb 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -18,8 +18,8 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <limits.h>
-#include <string.h>
+#include <climits>
+#include <cstring>
 
 static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
index 0ff9b57f7b..ceb79ddfc4 100644
--- a/test_conformance/math_brute_force/binary_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -18,8 +18,8 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <limits.h>
-#include <string.h>
+#include <climits>
+#include <cstring>
 
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index 7f86afde24..939ea6d684 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 static int BuildKernelDouble(const char *name, const char *operator_symbol,
                              int vectorSize, cl_uint kernel_count, cl_kernel *k,
diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
index 56b0280c97..efef4fe556 100644
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 static int BuildKernel(const char *name, const char *operator_symbol,
                        int vectorSize, cl_uint kernel_count, cl_kernel *k,
diff --git a/test_conformance/math_brute_force/binary_two_results_i_double.cpp b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
index 5f1ba3b20a..7f22af2a1c 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
@@ -18,8 +18,8 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <limits.h>
-#include <string.h>
+#include <climits>
+#include <cstring>
 
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/binary_two_results_i_float.cpp b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
index 4ea7a85dae..1daf2b99ea 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
@@ -18,8 +18,8 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <limits.h>
-#include <string.h>
+#include <climits>
+#include <cstring>
 
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/i_unary_double.cpp b/test_conformance/math_brute_force/i_unary_double.cpp
index 8cb863b311..f52fc13666 100644
--- a/test_conformance/math_brute_force/i_unary_double.cpp
+++ b/test_conformance/math_brute_force/i_unary_double.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/i_unary_float.cpp b/test_conformance/math_brute_force/i_unary_float.cpp
index feecb54c1e..6d04a24851 100644
--- a/test_conformance/math_brute_force/i_unary_float.cpp
+++ b/test_conformance/math_brute_force/i_unary_float.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index 9b5d8f2480..81bc4d0bcf 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
index ece960373b..1b5dc33a70 100644
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp
index 8d80abb4b8..6f1e27184f 100644
--- a/test_conformance/math_brute_force/macro_unary_double.cpp
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
diff --git a/test_conformance/math_brute_force/macro_unary_float.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp
index 2a37c95ba9..86000a36a4 100644
--- a/test_conformance/math_brute_force/macro_unary_float.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/mad_double.cpp b/test_conformance/math_brute_force/mad_double.cpp
index cbbc195123..1393b1defd 100644
--- a/test_conformance/math_brute_force/mad_double.cpp
+++ b/test_conformance/math_brute_force/mad_double.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/mad_float.cpp b/test_conformance/math_brute_force/mad_float.cpp
index 2124b268d6..eaab29fb6c 100644
--- a/test_conformance/math_brute_force/mad_float.cpp
+++ b/test_conformance/math_brute_force/mad_float.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 4be2020fc0..9d0b615ae3 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -20,8 +20,8 @@
 
 #include <cstdio>
 #include <cstdlib>
+#include <ctime>
 #include <string>
-#include <time.h>
 
 #include "harness/errorHelpers.h"
 #include "harness/kernelHelpers.h"
diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp
index d7ad4c7f99..cca134d4c4 100644
--- a/test_conformance/math_brute_force/reference_math.cpp
+++ b/test_conformance/math_brute_force/reference_math.cpp
@@ -14,12 +14,13 @@
 // limitations under the License.
 //
 
-#include "harness/compat.h"
 #include "reference_math.h"
-#include <limits.h>
+#include "harness/compat.h"
+
+#include <climits>
 
 #if !defined(_WIN32)
-#include <string.h>
+#include <cstring>
 #endif
 
 #include "utility.h"
diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp
index 427f4efd8c..dc4bd8c62a 100644
--- a/test_conformance/math_brute_force/ternary_double.cpp
+++ b/test_conformance/math_brute_force/ternary_double.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 #define CORRECTLY_ROUNDED 0
 #define FLUSHED 1
diff --git a/test_conformance/math_brute_force/ternary_float.cpp b/test_conformance/math_brute_force/ternary_float.cpp
index 3b3bde7cc8..3b2adf8035 100644
--- a/test_conformance/math_brute_force/ternary_float.cpp
+++ b/test_conformance/math_brute_force/ternary_float.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 #define CORRECTLY_ROUNDED 0
 #define FLUSHED 1
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
index b97b1943e6..1ff3d9c852 100644
--- a/test_conformance/math_brute_force/unary_double.cpp
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 #if defined(__APPLE__)
 #include <sys/time.h>
diff --git a/test_conformance/math_brute_force/unary_float.cpp b/test_conformance/math_brute_force/unary_float.cpp
index 4c1bd7ab8a..52ba0eb521 100644
--- a/test_conformance/math_brute_force/unary_float.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 #if defined(__APPLE__)
 #include <sys/time.h>
diff --git a/test_conformance/math_brute_force/unary_two_results_double.cpp b/test_conformance/math_brute_force/unary_two_results_double.cpp
index 779c653ab3..e2231a5830 100644
--- a/test_conformance/math_brute_force/unary_two_results_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_double.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/unary_two_results_float.cpp b/test_conformance/math_brute_force/unary_two_results_float.cpp
index cda80b47d3..d85d60f0d0 100644
--- a/test_conformance/math_brute_force/unary_two_results_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_float.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/unary_two_results_i_double.cpp b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
index 3fd616a4aa..4bf017ce56 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
@@ -18,8 +18,8 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <limits.h>
-#include <string.h>
+#include <climits>
+#include <cstring>
 
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/unary_two_results_i_float.cpp b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
index 82bbb81b3e..9bf297edce 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
@@ -18,8 +18,8 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <limits.h>
-#include <string.h>
+#include <climits>
+#include <cstring>
 
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/unary_u_double.cpp b/test_conformance/math_brute_force/unary_u_double.cpp
index d3b9218695..e0641b6ae6 100644
--- a/test_conformance/math_brute_force/unary_u_double.cpp
+++ b/test_conformance/math_brute_force/unary_u_double.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/unary_u_float.cpp b/test_conformance/math_brute_force/unary_u_float.cpp
index 74b3b76048..2ff575265e 100644
--- a/test_conformance/math_brute_force/unary_u_float.cpp
+++ b/test_conformance/math_brute_force/unary_u_float.cpp
@@ -18,7 +18,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
-#include <string.h>
+#include <cstring>
 
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h
index dd3c5e5633..894a067593 100644
--- a/test_conformance/math_brute_force/utility.h
+++ b/test_conformance/math_brute_force/utility.h
@@ -17,13 +17,6 @@
 #define UTILITY_H
 
 #include "harness/compat.h"
-
-#ifdef __APPLE__
-#include <OpenCL/opencl.h>
-#else
-#include <CL/opencl.h>
-#endif
-#include <stdio.h>
 #include "harness/rounding_mode.h"
 #include "harness/fpcontrol.h"
 #include "harness/testHarness.h"

From 3307ebed9a2fb6e225ff91e56da1042a3156d34d Mon Sep 17 00:00:00 2001
From: "jianguang.li" <891528583@qq.com>
Date: Fri, 5 Mar 2021 22:25:09 +0800
Subject: [PATCH 053/158] fix out of boundary when strcpy (#1173)

* fix out of boundary when strcpy, fix #1170

* fix format check failure

* fix format check error
---
 test_conformance/gles/test_fence_sync.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_conformance/gles/test_fence_sync.cpp b/test_conformance/gles/test_fence_sync.cpp
index 75e9d358bf..0af91a4622 100644
--- a/test_conformance/gles/test_fence_sync.cpp
+++ b/test_conformance/gles/test_fence_sync.cpp
@@ -160,7 +160,7 @@ GLuint createShaderProgram(GLint *posLoc, GLint *colLoc)
     glAttachShader(program, vpShader);
 
     GLuint fpShader;
-    char* fpstr = (char*)malloc(strlen(fragmentshader));
+    char *fpstr = (char *)malloc(sizeof(fragmentshader));
     strcpy(fpstr, fragmentshader);
     fpShader = glCreateShader(GL_FRAGMENT_SHADER);
     glShaderSource(fpShader, 1, (const GLchar **)&fpstr, NULL);

From afe745f47ea1036e34eaab67023bf29ebc9d95f6 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Tue, 9 Mar 2021 09:08:38 +0000
Subject: [PATCH 054/158] Fix typo in array size computation (#1178)

Because specialValuesInt2Count < specialValuesIntCount (when correctly
computed), there was no out-of-bounds access when running the tests. This
also means additional cases will be covered now that the typo is fixed.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/binary_i_double.cpp            | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index bbc5905ebb..6839dfb9aa 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -129,15 +129,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->relaxedMode);
 }
 
-static const int specialValuesInt[] = {
-    0,           1,           2,          3,          126,        127,
-    128,         0x02000001,  0x04000001, 1465264071, 1488522147, -1,
-    -2,          -3,          -126,       -127,       -128,       -0x02000001,
-    -0x04000001, -1465264071, -1488522147
-};
-static size_t specialValuesIntCount =
-    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
-
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
@@ -291,8 +282,8 @@ static const int specialValuesInt2[] = { 0,       1,     2,      3,
                                          1022,    1023,  1024,   INT_MIN,
                                          INT_MAX, -1,    -2,     -3,
                                          -1022,   -1023, -11024, -INT_MAX };
-static size_t specialValuesInt2Count =
-    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
+static constexpr size_t specialValuesInt2Count =
+    sizeof(specialValuesInt2) / sizeof(specialValuesInt2[0]);
 
 static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
 

From 17632c97367b32ac56774833b2ad36a028b3f02f Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Tue, 9 Mar 2021 09:09:10 +0000
Subject: [PATCH 055/158] Remove undesired conversions from size_t to int
 (#1180)

Improve math_brute_force kernels by consistently using size_t to store
the result of get_global_id().

This change was missed in 5d7be40e (Remove undesired conversions from
size_t to int (#1153), 2021-02-11).

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/i_unary_float.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_conformance/math_brute_force/i_unary_float.cpp b/test_conformance/math_brute_force/i_unary_float.cpp
index 6d04a24851..6b4a0f8c25 100644
--- a/test_conformance/math_brute_force/i_unary_float.cpp
+++ b/test_conformance/math_brute_force/i_unary_float.cpp
@@ -31,7 +31,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in)\n"
                         "{\n"
-                        "   int i = get_global_id(0);\n"
+                        "   size_t i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i] );\n"

From 6f2cd12a0b8d3772e155f2a2b4d399bd534fe382 Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Tue, 9 Mar 2021 09:09:52 +0000
Subject: [PATCH 056/158] Deduplicate logging of pixel differences (#1175)

clCopyImage and clFillImage contain near-duplicate code for logging of
pixel difference errors.  Move this into imageHelpers.

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
---
 test_common/harness/imageHelpers.cpp          | 71 +++++++++++++++++++
 test_common/harness/imageHelpers.h            |  5 ++
 .../images/clCopyImage/test_copy_generic.cpp  | 50 ++-----------
 .../images/clFillImage/test_fill_generic.cpp  | 48 +------------
 4 files changed, 83 insertions(+), 91 deletions(-)

diff --git a/test_common/harness/imageHelpers.cpp b/test_common/harness/imageHelpers.cpp
index b9cbfe7a1c..72a2f0c03c 100644
--- a/test_common/harness/imageHelpers.cpp
+++ b/test_common/harness/imageHelpers.cpp
@@ -408,6 +408,77 @@ int get_32_bit_image_format(cl_context context, cl_mem_object_type objType,
     return -1;
 }
 
+void print_first_pixel_difference_error(size_t where, const char *sourcePixel,
+                                        const char *destPixel,
+                                        image_descriptor *imageInfo, size_t y,
+                                        size_t thirdDim)
+{
+    size_t pixel_size = get_pixel_size(imageInfo->format);
+
+    log_error("ERROR: Scanline %d did not verify for image size %d,%d,%d "
+              "pitch %d (extra %d bytes)\n",
+              (int)y, (int)imageInfo->width, (int)imageInfo->height,
+              (int)thirdDim, (int)imageInfo->rowPitch,
+              (int)imageInfo->rowPitch
+                  - (int)imageInfo->width * (int)pixel_size);
+    log_error("Failed at column: %ld   ", where);
+
+    switch (pixel_size)
+    {
+        case 1:
+            log_error("*0x%2.2x vs. 0x%2.2x\n", ((cl_uchar *)sourcePixel)[0],
+                      ((cl_uchar *)destPixel)[0]);
+            break;
+        case 2:
+            log_error("*0x%4.4x vs. 0x%4.4x\n", ((cl_ushort *)sourcePixel)[0],
+                      ((cl_ushort *)destPixel)[0]);
+            break;
+        case 3:
+            log_error("*{0x%2.2x, 0x%2.2x, 0x%2.2x} vs. "
+                      "{0x%2.2x, 0x%2.2x, 0x%2.2x}\n",
+                      ((cl_uchar *)sourcePixel)[0],
+                      ((cl_uchar *)sourcePixel)[1],
+                      ((cl_uchar *)sourcePixel)[2], ((cl_uchar *)destPixel)[0],
+                      ((cl_uchar *)destPixel)[1], ((cl_uchar *)destPixel)[2]);
+            break;
+        case 4:
+            log_error("*0x%8.8x vs. 0x%8.8x\n", ((cl_uint *)sourcePixel)[0],
+                      ((cl_uint *)destPixel)[0]);
+            break;
+        case 6:
+            log_error(
+                "*{0x%4.4x, 0x%4.4x, 0x%4.4x} vs. "
+                "{0x%4.4x, 0x%4.4x, 0x%4.4x}\n",
+                ((cl_ushort *)sourcePixel)[0], ((cl_ushort *)sourcePixel)[1],
+                ((cl_ushort *)sourcePixel)[2], ((cl_ushort *)destPixel)[0],
+                ((cl_ushort *)destPixel)[1], ((cl_ushort *)destPixel)[2]);
+            break;
+        case 8:
+            log_error("*0x%16.16llx vs. 0x%16.16llx\n",
+                      ((cl_ulong *)sourcePixel)[0], ((cl_ulong *)destPixel)[0]);
+            break;
+        case 12:
+            log_error("*{0x%8.8x, 0x%8.8x, 0x%8.8x} vs. "
+                      "{0x%8.8x, 0x%8.8x, 0x%8.8x}\n",
+                      ((cl_uint *)sourcePixel)[0], ((cl_uint *)sourcePixel)[1],
+                      ((cl_uint *)sourcePixel)[2], ((cl_uint *)destPixel)[0],
+                      ((cl_uint *)destPixel)[1], ((cl_uint *)destPixel)[2]);
+            break;
+        case 16:
+            log_error("*{0x%8.8x, 0x%8.8x, 0x%8.8x, 0x%8.8x} vs. "
+                      "{0x%8.8x, 0x%8.8x, 0x%8.8x, 0x%8.8x}\n",
+                      ((cl_uint *)sourcePixel)[0], ((cl_uint *)sourcePixel)[1],
+                      ((cl_uint *)sourcePixel)[2], ((cl_uint *)sourcePixel)[3],
+                      ((cl_uint *)destPixel)[0], ((cl_uint *)destPixel)[1],
+                      ((cl_uint *)destPixel)[2], ((cl_uint *)destPixel)[3]);
+            break;
+        default:
+            log_error("Don't know how to print pixel size of %ld\n",
+                      pixel_size);
+            break;
+    }
+}
+
 int random_log_in_range(int minV, int maxV, MTdata d)
 {
     double v = log2(((double)genrand_int32(d) / (double)0xffffffff) + 1);
diff --git a/test_common/harness/imageHelpers.h b/test_common/harness/imageHelpers.h
index 8544fbfa16..aba70508c8 100644
--- a/test_common/harness/imageHelpers.h
+++ b/test_common/harness/imageHelpers.h
@@ -134,6 +134,11 @@ typedef struct
     float p[4];
 } FloatPixel;
 
+void print_first_pixel_difference_error(size_t where, const char *sourcePixel,
+                                        const char *destPixel,
+                                        image_descriptor *imageInfo, size_t y,
+                                        size_t thirdDim);
+
 void get_max_sizes(size_t *numberOfSizes, const int maxNumberOfSizes,
                    size_t sizes[][3], size_t maxWidth, size_t maxHeight,
                    size_t maxDepth, size_t maxArraySize,
diff --git a/test_conformance/images/clCopyImage/test_copy_generic.cpp b/test_conformance/images/clCopyImage/test_copy_generic.cpp
index 577fa47b32..026916e8cb 100644
--- a/test_conformance/images/clCopyImage/test_copy_generic.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_generic.cpp
@@ -284,7 +284,6 @@ cl_mem create_image( cl_context context, cl_command_queue queue, BufferOwningPtr
     return img;
 }
 
-
 // WARNING -- not thread safe
 BufferOwningPtr<char> srcData;
 BufferOwningPtr<char> dstData;
@@ -548,58 +547,17 @@ int test_copy_image_generic( cl_context context, cl_command_queue queue, image_d
         {
             if( memcmp( sourcePtr, destPtr, scanlineSize ) != 0 )
             {
-                log_error( "ERROR: Scanline %d did not verify for image size %d,%d,%d pitch %d (extra %d bytes)\n", (int)y, (int)dstImageInfo->width, (int)dstImageInfo->height, (int)dstImageInfo->depth, (int)dstImageInfo->rowPitch, (int)dstImageInfo->rowPitch - (int)dstImageInfo->width * (int)get_pixel_size( dstImageInfo->format ) );
-
                 // Find the first missing pixel
                 size_t pixel_size = get_pixel_size( dstImageInfo->format );
                 size_t where = 0;
                 for( where = 0; where < dstImageInfo->width; where++ )
                     if( memcmp( sourcePtr + pixel_size * where, destPtr + pixel_size * where, pixel_size) )
                         break;
-                log_error( "Failed at column: %ld   ", where );
-                switch( pixel_size )
-                {
-                    case 1:
-                        log_error( "*0x%2.2x vs. 0x%2.2x\n", ((cl_uchar*)(sourcePtr + pixel_size * where))[0], ((cl_uchar*)(destPtr + pixel_size * where))[0] );
-                        break;
-                    case 2:
-                        log_error( "*0x%4.4x vs. 0x%4.4x\n", ((cl_ushort*)(sourcePtr + pixel_size * where))[0], ((cl_ushort*)(destPtr + pixel_size * where))[0] );
-                        break;
-                    case 3:
-                        log_error( "*{0x%2.2x, 0x%2.2x, 0x%2.2x} vs. {0x%2.2x, 0x%2.2x, 0x%2.2x}\n",
-                                  ((cl_uchar*)(sourcePtr + pixel_size * where))[0], ((cl_uchar*)(sourcePtr + pixel_size * where))[1], ((cl_uchar*)(sourcePtr + pixel_size * where))[2],
-                                  ((cl_uchar*)(destPtr + pixel_size * where))[0], ((cl_uchar*)(destPtr + pixel_size * where))[1], ((cl_uchar*)(destPtr + pixel_size * where))[2]
-                                  );
-                        break;
-                    case 4:
-                        log_error( "*0x%8.8x vs. 0x%8.8x\n", ((cl_uint*)(sourcePtr + pixel_size * where))[0], ((cl_uint*)(destPtr + pixel_size * where))[0] );
-                        break;
-                    case 6:
-                        log_error( "*{0x%4.4x, 0x%4.4x, 0x%4.4x} vs. {0x%4.4x, 0x%4.4x, 0x%4.4x}\n",
-                                  ((cl_ushort*)(sourcePtr + pixel_size * where))[0], ((cl_ushort*)(sourcePtr + pixel_size * where))[1], ((cl_ushort*)(sourcePtr + pixel_size * where))[2],
-                                  ((cl_ushort*)(destPtr + pixel_size * where))[0], ((cl_ushort*)(destPtr + pixel_size * where))[1], ((cl_ushort*)(destPtr + pixel_size * where))[2]
-                                  );
-                        break;
-                    case 8:
-                        log_error( "*0x%16.16llx vs. 0x%16.16llx\n", ((cl_ulong*)(sourcePtr + pixel_size * where))[0], ((cl_ulong*)(destPtr + pixel_size * where))[0] );
-                        break;
-                    case 12:
-                        log_error( "*{0x%8.8x, 0x%8.8x, 0x%8.8x} vs. {0x%8.8x, 0x%8.8x, 0x%8.8x}\n",
-                                  ((cl_uint*)(sourcePtr + pixel_size * where))[0], ((cl_uint*)(sourcePtr + pixel_size * where))[1], ((cl_uint*)(sourcePtr + pixel_size * where))[2],
-                                  ((cl_uint*)(destPtr + pixel_size * where))[0], ((cl_uint*)(destPtr + pixel_size * where))[1], ((cl_uint*)(destPtr + pixel_size * where))[2]
-                                  );
-                        break;
-                    case 16:
-                        log_error( "*{0x%8.8x, 0x%8.8x, 0x%8.8x, 0x%8.8x} vs. {0x%8.8x, 0x%8.8x, 0x%8.8x, 0x%8.8x}\n",
-                                  ((cl_uint*)(sourcePtr + pixel_size * where))[0], ((cl_uint*)(sourcePtr + pixel_size * where))[1], ((cl_uint*)(sourcePtr + pixel_size * where))[2], ((cl_uint*)(sourcePtr + pixel_size * where))[3],
-                                  ((cl_uint*)(destPtr + pixel_size * where))[0], ((cl_uint*)(destPtr + pixel_size * where))[1], ((cl_uint*)(destPtr + pixel_size * where))[2], ((cl_uint*)(destPtr + pixel_size * where))[3]
-                                  );
-                        break;
-                    default:
-                        log_error( "Don't know how to print pixel size of %ld\n", pixel_size );
-                        break;
-                }
 
+                print_first_pixel_difference_error(
+                    where, sourcePtr + pixel_size * where,
+                    destPtr + pixel_size * where, dstImageInfo, y,
+                    dstImageInfo->depth);
                 return -1;
             }
             sourcePtr += rowPitch;
diff --git a/test_conformance/images/clFillImage/test_fill_generic.cpp b/test_conformance/images/clFillImage/test_fill_generic.cpp
index c598939251..59bf24ad21 100644
--- a/test_conformance/images/clFillImage/test_fill_generic.cpp
+++ b/test_conformance/images/clFillImage/test_fill_generic.cpp
@@ -478,58 +478,16 @@ int test_fill_image_generic( cl_context context, cl_command_queue queue, image_d
 
             if (memcmp( sourcePtr, destPtr, scanlineSize ) != 0)
             {
-                log_error( "ERROR: Scanline %d did not verify for image size %d,%d,%d pitch %d (extra %d bytes)\n", (int)y, (int)imageInfo->width, (int)imageInfo->height, (int)thirdDim, (int)imageInfo->rowPitch, (int)imageInfo->rowPitch - (int)imageInfo->width * (int)get_pixel_size( imageInfo->format ) );
-
                 // Find the first missing pixel
                 size_t pixel_size = get_pixel_size( imageInfo->format );
                 size_t where = 0;
                 for ( where = 0; where < imageInfo->width; where++ )
                     if ( memcmp( sourcePtr + pixel_size * where, destPtr + pixel_size * where, pixel_size) )
                         break;
-                log_error( "Failed at column: %ld   ", where );
-                switch ( pixel_size )
-                {
-                case 1:
-                    log_error( "*0x%2.2x vs. 0x%2.2x\n", ((cl_uchar*)(sourcePtr + pixel_size * where))[0], ((cl_uchar*)(destPtr + pixel_size * where))[0] );
-                    break;
-                case 2:
-                    log_error( "*0x%4.4x vs. 0x%4.4x\n", ((cl_ushort*)(sourcePtr + pixel_size * where))[0], ((cl_ushort*)(destPtr + pixel_size * where))[0] );
-                    break;
-                case 3:
-                    log_error( "*{0x%2.2x, 0x%2.2x, 0x%2.2x} vs. {0x%2.2x, 0x%2.2x, 0x%2.2x}\n",
-                               ((cl_uchar*)(sourcePtr + pixel_size * where))[0], ((cl_uchar*)(sourcePtr + pixel_size * where))[1], ((cl_uchar*)(sourcePtr + pixel_size * where))[2],
-                               ((cl_uchar*)(destPtr + pixel_size * where))[0], ((cl_uchar*)(destPtr + pixel_size * where))[1], ((cl_uchar*)(destPtr + pixel_size * where))[2]
-                             );
-                    break;
-                case 4:
-                    log_error( "*0x%8.8x vs. 0x%8.8x\n", ((cl_uint*)(sourcePtr + pixel_size * where))[0], ((cl_uint*)(destPtr + pixel_size * where))[0] );
-                    break;
-                case 6:
-                    log_error( "*{0x%4.4x, 0x%4.4x, 0x%4.4x} vs. {0x%4.4x, 0x%4.4x, 0x%4.4x}\n",
-                               ((cl_ushort*)(sourcePtr + pixel_size * where))[0], ((cl_ushort*)(sourcePtr + pixel_size * where))[1], ((cl_ushort*)(sourcePtr + pixel_size * where))[2],
-                               ((cl_ushort*)(destPtr + pixel_size * where))[0], ((cl_ushort*)(destPtr + pixel_size * where))[1], ((cl_ushort*)(destPtr + pixel_size * where))[2]
-                             );
-                    break;
-                case 8:
-                    log_error( "*0x%16.16llx vs. 0x%16.16llx\n", ((cl_ulong*)(sourcePtr + pixel_size * where))[0], ((cl_ulong*)(destPtr + pixel_size * where))[0] );
-                    break;
-                case 12:
-                    log_error( "*{0x%8.8x, 0x%8.8x, 0x%8.8x} vs. {0x%8.8x, 0x%8.8x, 0x%8.8x}\n",
-                               ((cl_uint*)(sourcePtr + pixel_size * where))[0], ((cl_uint*)(sourcePtr + pixel_size * where))[1], ((cl_uint*)(sourcePtr + pixel_size * where))[2],
-                               ((cl_uint*)(destPtr + pixel_size * where))[0], ((cl_uint*)(destPtr + pixel_size * where))[1], ((cl_uint*)(destPtr + pixel_size * where))[2]
-                             );
-                    break;
-                case 16:
-                    log_error( "*{0x%8.8x, 0x%8.8x, 0x%8.8x, 0x%8.8x} vs. {0x%8.8x, 0x%8.8x, 0x%8.8x, 0x%8.8x}\n",
-                               ((cl_uint*)(sourcePtr + pixel_size * where))[0], ((cl_uint*)(sourcePtr + pixel_size * where))[1], ((cl_uint*)(sourcePtr + pixel_size * where))[2], ((cl_uint*)(sourcePtr + pixel_size * where))[3],
-                               ((cl_uint*)(destPtr + pixel_size * where))[0], ((cl_uint*)(destPtr + pixel_size * where))[1], ((cl_uint*)(destPtr + pixel_size * where))[2], ((cl_uint*)(destPtr + pixel_size * where))[3]
-                             );
-                    break;
-                default:
-                    log_error( "Don't know how to print pixel size of %ld\n", pixel_size );
-                    break;
-                }
 
+                print_first_pixel_difference_error(
+                    where, sourcePtr + pixel_size * where,
+                    destPtr + pixel_size * where, imageInfo, y, thirdDim);
                 return -1;
             }
 

From a483255e504fb0c11bfabd02d627567d7774145a Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Tue, 9 Mar 2021 09:18:40 +0000
Subject: [PATCH 057/158] Fold functions for nextafter (#1176)

* Add assertion to prove isNextafter is true iff the builtin is nextafter()

Signed-off-by: Marco Antognini <marco.antognini@arm.com>

* Fold functions for nextafter

There is no need to differentiate nextafter() from other binary builtin
functions when creating the vtbl entries.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/binary_double.cpp         | 18 +++---------------
 .../math_brute_force/binary_float.cpp          | 16 ++--------------
 .../math_brute_force/function_list.cpp         | 10 +---------
 3 files changed, 6 insertions(+), 38 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
index a4dbd98e54..fad03ade5a 100644
--- a/test_conformance/math_brute_force/binary_double.cpp
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -285,9 +285,7 @@ static size_t specialValuesDoubleCount =
 
 static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
 
-static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
-                                                int isNextafter,
-                                                bool relaxedMode)
+int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfo test_info;
     cl_int error;
@@ -329,7 +327,8 @@ static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
 
     test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
     test_info.skipNanInf = 0;
-    test_info.isNextafter = isNextafter;
+    test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
+
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
@@ -934,14 +933,3 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 exit:
     return error;
 }
-
-int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    return TestFunc_Double_Double_Double_common(f, d, 0, relaxedMode);
-}
-
-int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata d,
-                                            bool relaxedMode)
-{
-    return TestFunc_Double_Double_Double_common(f, d, 1, relaxedMode);
-}
diff --git a/test_conformance/math_brute_force/binary_float.cpp b/test_conformance/math_brute_force/binary_float.cpp
index 16012961c0..a31bfb2f23 100644
--- a/test_conformance/math_brute_force/binary_float.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -273,8 +273,7 @@ typedef struct TestInfo
 
 static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
-static int TestFunc_Float_Float_Float_common(const Func *f, MTdata d,
-                                             int isNextafter, bool relaxedMode)
+int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfo test_info;
     cl_int error;
@@ -318,7 +317,7 @@ static int TestFunc_Float_Float_Float_common(const Func *f, MTdata d,
     test_info.relaxedMode = relaxedMode;
     test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
     test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
-    test_info.isNextafter = isNextafter;
+    test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
 
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
@@ -1094,14 +1093,3 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     if (overflow) free(overflow);
     return error;
 }
-
-int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    return TestFunc_Float_Float_Float_common(f, d, 0, relaxedMode);
-}
-
-int TestFunc_Float_Float_Float_nextafter(const Func *f, MTdata d,
-                                         bool relaxedMode)
-{
-    return TestFunc_Float_Float_Float_common(f, d, 1, relaxedMode);
-}
diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp
index ef72a085b6..3edbb4854d 100644
--- a/test_conformance/math_brute_force/function_list.cpp
+++ b/test_conformance/math_brute_force/function_list.cpp
@@ -58,7 +58,6 @@
 #define unaryF_u NULL
 #define macro_unaryF NULL
 #define binaryF NULL
-#define binaryF_nextafter NULL
 #define binaryOperatorF NULL
 #define binaryF_i NULL
 #define macro_binaryF NULL
@@ -134,12 +133,6 @@ static constexpr vtbl _binary = {
     TestFunc_Double_Double_Double,
 };
 
-static constexpr vtbl _binary_nextafter = {
-    "binary_nextafter",
-    TestFunc_Float_Float_Float_nextafter,
-    TestFunc_Double_Double_Double_nextafter,
-};
-
 static constexpr vtbl _binary_operator = {
     "binaryOperator",
     TestFunc_Float_Float_Float_Operator,
@@ -193,7 +186,6 @@ static constexpr vtbl _mad_tbl = {
 #define unaryF_u &_unary_u
 #define macro_unaryF &_macro_unary
 #define binaryF &_binary
-#define binaryF_nextafter &_binary_nextafter
 #define binaryOperatorF &_binary_operator
 #define binaryF_i &_binary_i
 #define macro_binaryF &_macro_binary
@@ -285,7 +277,7 @@ const Func functionList[] = {
     ENTRY(minmag, 0.0f, 0.0f, FTZ_OFF, binaryF),
     ENTRY(modf, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
     ENTRY(nan, 0.0f, 0.0f, FTZ_OFF, unaryF_u),
-    ENTRY(nextafter, 0.0f, 0.0f, FTZ_OFF, binaryF_nextafter),
+    ENTRY(nextafter, 0.0f, 0.0f, FTZ_OFF, binaryF),
     ENTRY_EXT(pow, 16.0f, 16.0f, 8192.0f, FTZ_OFF, binaryF,
               8192.0f), // in derived mode the ulp error is calculated as
                         // exp2(y*log2(x)) and in non-derived it is the same as

From a53917a37e6c1a38e62ac513ecba4a013b6f876e Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Tue, 9 Mar 2021 22:55:33 +0000
Subject: [PATCH 058/158] Move code around to reduce differences (#1185)

Code is moved to reduce the differences between tests for single- and
double-precision.

Improve consistency of double literals (e.g. `1000.` becomes `1000.0`).

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/binary_double.cpp        |   8 +-
 .../math_brute_force/binary_float.cpp         |  79 +--
 .../math_brute_force/binary_i_double.cpp      |   8 +-
 .../math_brute_force/binary_i_float.cpp       |  72 +--
 .../binary_operator_double.cpp                |   4 +-
 .../binary_operator_float.cpp                 |  76 +--
 .../math_brute_force/macro_binary_double.cpp  |   8 +-
 .../math_brute_force/macro_binary_float.cpp   |  58 +--
 .../math_brute_force/ternary_float.cpp        |   2 +-
 .../math_brute_force/unary_double.cpp         | 478 +++++++++---------
 10 files changed, 398 insertions(+), 395 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
index fad03ade5a..cbb186ed04 100644
--- a/test_conformance/math_brute_force/binary_double.cpp
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -186,8 +186,8 @@ static const double specialValuesDouble[] = {
     MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
     MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
-    -1000.,
-    -100.,
+    -1000.0,
+    -100.0,
     -4.0,
     -3.5,
     -3.0,
@@ -240,8 +240,8 @@ static const double specialValuesDouble[] = {
     MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
     MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
-    +1000.,
-    +100.,
+    +1000.0,
+    +100.0,
     +4.0,
     +3.5,
     +3.0,
diff --git a/test_conformance/math_brute_force/binary_float.cpp b/test_conformance/math_brute_force/binary_float.cpp
index a31bfb2f23..8dfb9f40fc 100644
--- a/test_conformance/math_brute_force/binary_float.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -126,6 +126,45 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
+// Thread specific data for a worker thread
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // input buffer for the thread
+    cl_mem inBuf2; // input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    double maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdata d;
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    int isFDim;
+    int skipNanInf;
+    int isNextafter;
+    bool relaxedMode; // True if test is running in relaxed mode, false
+                      // otherwise.
+} TestInfo;
+
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -226,50 +265,12 @@ static const float specialValuesFloat[] = {
     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
-    +0.0f
+    +0.0f,
 };
 
 static const size_t specialValuesFloatCount =
     sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
-// Thread specific data for a worker thread
-typedef struct ThreadInfo
-{
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
-    float maxError; // max error value. Init to 0.
-    double
-        maxErrorValue; // position of the max error value (param 1).  Init to 0.
-    double maxErrorValue2; // position of the max error value (param 2).  Init
-                           // to 0.
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
-
-typedef struct TestInfo
-{
-    size_t subBufferSize; // Size of the sub-buffer in elements
-    const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
-    cl_uint threadCount; // Number of worker threads
-    cl_uint jobCount; // Number of jobs
-    cl_uint step; // step between each chunk and the next.
-    cl_uint scale; // stride between individual test values
-    float ulps; // max_allowed ulps
-    int ftz; // non-zero if running in flush to zero mode
-
-    int isFDim;
-    int skipNanInf;
-    int isNextafter;
-    bool relaxedMode; // True if test is running in relaxed mode, false
-                      // otherwise.
-} TestInfo;
 
 static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index 6839dfb9aa..eb2c59e029 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -181,8 +181,8 @@ static const double specialValuesDouble[] = {
     MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
     MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
-    -1000.,
-    -100.,
+    -1000.0,
+    -100.0,
     -4.0,
     -3.5,
     -3.0,
@@ -235,8 +235,8 @@ static const double specialValuesDouble[] = {
     MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
     MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
-    +1000.,
-    +100.,
+    +1000.0,
+    +100.0,
     +4.0,
     +3.5,
     +3.0,
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
index ceb79ddfc4..019c96a5bf 100644
--- a/test_conformance/math_brute_force/binary_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -125,6 +125,41 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
+// Thread-specific data for a worker thread
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // first input buffer for the thread
+    cl_mem inBuf2; // second input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers, one per vector size
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    cl_int maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdata d; // NOTE(review): presumably per-thread RNG state; confirm
+    cl_command_queue tQueue; // per-thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo // Test-wide settings plus per-thread kernels/state
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // one program per vector size
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // Array of ThreadInfo, one entry per worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // maximum allowed error, in ulps
+    int ftz; // non-zero if running in flush-to-zero mode
+
+    // no special fields
+} TestInfo;
+
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -225,7 +260,7 @@ static const float specialValuesFloat[] = {
     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
-    +0.0f
+    +0.0f,
 };
 
 static const size_t specialValuesFloatCount =
@@ -240,41 +275,6 @@ static const int specialValuesInt[] = {
 static size_t specialValuesIntCount =
     sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
 
-// Thread specific data for a worker thread
-typedef struct ThreadInfo
-{
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
-    float maxError; // max error value. Init to 0.
-    double
-        maxErrorValue; // position of the max error value (param 1).  Init to 0.
-    cl_int maxErrorValue2; // position of the max error value (param 2).  Init
-                           // to 0.
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
-
-typedef struct TestInfo
-{
-    size_t subBufferSize; // Size of the sub-buffer in elements
-    const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
-    cl_uint threadCount; // Number of worker threads
-    cl_uint jobCount; // Number of jobs
-    cl_uint step; // step between each chunk and the next.
-    cl_uint scale; // stride between individual test values
-    float ulps; // max_allowed ulps
-    int ftz; // non-zero if running in flush to zero mode
-
-    // no special values
-} TestInfo;
-
 static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index 939ea6d684..a7fb3cd0d3 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -241,8 +241,8 @@ static const double specialValuesDouble[] = {
     MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
     MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
-    +1000.,
-    +100.,
+    +1000.0,
+    +100.0,
     +4.0,
     +3.5,
     +3.0,
diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
index efef4fe556..a9d3b7cf25 100644
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -130,6 +130,43 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
+// Thread-specific data for a worker thread
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // first input buffer for the thread
+    cl_mem inBuf2; // second input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers, one per vector size
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    double maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdata d; // NOTE(review): presumably per-thread RNG state; confirm
+    cl_command_queue tQueue; // per-thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo // Test-wide settings plus per-thread kernels/state
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // one program per vector size
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // Array of ThreadInfo, one entry per worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // maximum allowed error, in ulps
+    int ftz; // non-zero if running in flush-to-zero mode
+    bool relaxedMode; // True if the test is being run in relaxed mode, false
+                      // otherwise.
+
+    // no special fields
+} TestInfo;
+
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -230,49 +267,12 @@ static const float specialValuesFloat[] = {
     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
-    +0.0f
+    +0.0f,
 };
 
 static const size_t specialValuesFloatCount =
     sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
-// Thread specific data for a worker thread
-typedef struct ThreadInfo
-{
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
-    float maxError; // max error value. Init to 0.
-    double
-        maxErrorValue; // position of the max error value (param 1).  Init to 0.
-    double maxErrorValue2; // position of the max error value (param 2).  Init
-                           // to 0.
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
-
-typedef struct TestInfo
-{
-    size_t subBufferSize; // Size of the sub-buffer in elements
-    const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
-    cl_uint threadCount; // Number of worker threads
-    cl_uint jobCount; // Number of jobs
-    cl_uint step; // step between each chunk and the next.
-    cl_uint scale; // stride between individual test values
-    float ulps; // max_allowed ulps
-    int ftz; // non-zero if running in flush to zero mode
-    bool relaxedMode; // True if the test is being run in relaxed mode, false
-                      // otherwise.
-
-    // no special fields
-} TestInfo;
-
 static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index 81bc4d0bcf..2ea785e62e 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -173,8 +173,8 @@ static const double specialValuesDouble[] = {
     MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
     MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
-    -1000.,
-    -100.,
+    -1000.0,
+    -100.0,
     -4.0,
     -3.5,
     -3.0,
@@ -227,8 +227,8 @@ static const double specialValuesDouble[] = {
     MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
     MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
-    +1000.,
-    +100.,
+    +1000.0,
+    +100.0,
     +4.0,
     +3.5,
     +3.0,
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
index 1b5dc33a70..a61ab6b392 100644
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -124,6 +124,34 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
+// Thread-specific data for a worker thread
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // first input buffer for the thread
+    cl_mem inBuf2; // second input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers, one per vector size
+    MTdata d; // NOTE(review): presumably per-thread RNG state; confirm
+    cl_command_queue tQueue; // per-thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo // Test-wide settings plus per-thread kernels/state
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // one program per vector size
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // Array of ThreadInfo, one entry per worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    int ftz; // non-zero if running in flush-to-zero mode
+
+} TestInfo;
+
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -224,40 +252,12 @@ static const float specialValuesFloat[] = {
     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
-    +0.0f
+    +0.0f,
 };
 
 static const size_t specialValuesFloatCount =
     sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
-// Thread specific data for a worker thread
-typedef struct ThreadInfo
-{
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
-
-typedef struct TestInfo
-{
-    size_t subBufferSize; // Size of the sub-buffer in elements
-    const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
-    cl_uint threadCount; // Number of worker threads
-    cl_uint jobCount; // Number of jobs
-    cl_uint step; // step between each chunk and the next.
-    cl_uint scale; // stride between individual test values
-    int ftz; // non-zero if running in flush to zero mode
-
-} TestInfo;
-
 static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
diff --git a/test_conformance/math_brute_force/ternary_float.cpp b/test_conformance/math_brute_force/ternary_float.cpp
index 3b2adf8035..1fcdc4a2c4 100644
--- a/test_conformance/math_brute_force/ternary_float.cpp
+++ b/test_conformance/math_brute_force/ternary_float.cpp
@@ -208,7 +208,7 @@ static const float specialValuesFloat[] = {
     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
-    +0.0f
+    +0.0f,
 };
 
 static const size_t specialValuesFloatCount =
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
index 1ff3d9c852..99959ae30f 100644
--- a/test_conformance/math_brute_force/unary_double.cpp
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -160,244 +160,7 @@ typedef struct TestInfo
                       // otherwise.
 } TestInfo;
 
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
-{
-    const TestInfo *job = (const TestInfo *)data;
-    size_t buffer_elements = job->subBufferSize;
-    size_t buffer_size = buffer_elements * sizeof(cl_double);
-    cl_uint scale = job->scale;
-    cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
-    float ulps = job->ulps;
-    dptr func = job->f->dfunc;
-    cl_uint j, k;
-    cl_int error;
-    int ftz = job->ftz;
-
-    Force64BitFPUPrecision();
-
-    // start the map of the output arrays
-    cl_event e[VECTOR_SIZE_COUNT];
-    cl_ulong *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
-
-    // Write the new values to the input array
-    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
-        p[j] = DoubleFromUInt32(base + j * scale);
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
-                                      buffer_size, p, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        return error;
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            return error;
-        }
-        if ((error = clReleaseEvent(e[j])))
-        {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            return error;
-        }
-
-        // Fill the result buffer with garbage, so that old results don't carry
-        // over
-        uint32_t pattern = 0xffffdead;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            return error;
-        }
-
-        // run the kernel
-        size_t vectorCount =
-            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
-                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];
-
-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-
-        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
-                                            &vectorCount, NULL, 0, NULL, NULL)))
-        {
-            vlog_error("FAILED -- could not execute kernel\n");
-            return error;
-        }
-    }
-
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
-
-    if (gSkipCorrectnessTesting) return CL_SUCCESS;
-
-    // Calculate the correctly rounded reference result
-    cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
-    cl_double *s = (cl_double *)p;
-    for (j = 0; j < buffer_elements; j++) r[j] = (cl_double)func.f_f(s[j]);
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
-            buffer_size, 0, NULL, NULL, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-    // Wait for the last buffer
-    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        return error;
-    }
-
-
-    // Verify data
-    cl_ulong *t = (cl_ulong *)r;
-    for (j = 0; j < buffer_elements; j++)
-    {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        {
-            cl_ulong *q = out[k];
-
-            // If we aren't getting the correctly rounded result
-            if (t[j] != q[j])
-            {
-                cl_double test = ((cl_double *)q)[j];
-                long double correct = func.f_f(s[j]);
-                float err = Bruteforce_Ulp_Error_Double(test, correct);
-                int fail = !(fabsf(err) <= ulps);
-
-                if (fail)
-                {
-                    if (ftz)
-                    {
-                        // retry per section 6.5.3.2
-                        if (IsDoubleResultSubnormal(correct, ulps))
-                        {
-                            fail = fail && (test != 0.0f);
-                            if (!fail) err = 0.0f;
-                        }
-
-                        // retry per section 6.5.3.3
-                        if (IsDoubleSubnormal(s[j]))
-                        {
-                            long double correct2 = func.f_f(0.0L);
-                            long double correct3 = func.f_f(-0.0L);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct2);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            fail = fail
-                                && ((!(fabsf(err2) <= ulps))
-                                    && (!(fabsf(err3) <= ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2, ulps)
-                                || IsDoubleResultSubnormal(correct3, ulps))
-                            {
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-                }
-                if (fabsf(err) > tinfo->maxError)
-                {
-                    tinfo->maxError = fabsf(err);
-                    tinfo->maxErrorValue = s[j];
-                }
-                if (fail)
-                {
-                    vlog_error("\nERROR: %s%s: %f ulp error at %.13la "
-                               "(0x%16.16llx): *%.13la vs. %.13la\n",
-                               job->f->name, sizeNames[k], err,
-                               ((cl_double *)gIn)[j], ((cl_ulong *)gIn)[j],
-                               ((cl_double *)gOut_Ref)[j], test);
-                    return -1;
-                }
-            }
-        }
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
-                       j, error);
-            return error;
-        }
-    }
-
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
-
-
-    if (0 == (base & 0x0fffffff))
-    {
-        if (gVerboseBruteForce)
-        {
-            vlog("base:%14u step:%10u scale:%10zd buf_elements:%10u ulps:%5.3f "
-                 "ThreadCount:%2u\n",
-                 base, job->step, buffer_elements, job->scale, job->ulps,
-                 job->threadCount);
-        }
-        else
-        {
-            vlog(".");
-        }
-        fflush(stdout);
-    }
-
-    return CL_SUCCESS;
-}
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
@@ -660,3 +423,242 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
     return error;
 }
+
+// Worker job: feeds one sub-buffer of inputs to the kernels for every enabled
+// vector size and compares the device results with the host reference
+// (func.f_f). Returns CL_SUCCESS, an OpenCL error code, or -1 on ULP failure.
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint scale = job->scale;
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
+    cl_uint j, k;
+    cl_int error;
+    int ftz = job->ftz;
+
+    Force64BitFPUPrecision();
+
+    // start the map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Write the new values to this thread's slice of the input array
+    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        p[j] = DoubleFromUInt32(base + j * scale);
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            return error;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            return error;
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint32_t pattern = 0xffffdead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
+            return error;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // per-thread kernel copy
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
+    cl_double *s = (cl_double *)p;
+    for (j = 0; j < buffer_elements; j++) r[j] = (cl_double)func.f_f(s[j]);
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+    // Wait for the last buffer
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        return error;
+    }
+
+    // Verify data
+    cl_ulong *t = (cl_ulong *)r;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_ulong *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (t[j] != q[j])
+            {
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_f(s[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail)
+                {
+                    if (ftz)
+                    {
+                        // retry per section 6.5.3.2
+                        if (IsDoubleResultSubnormal(correct, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (IsDoubleSubnormal(s[j]))
+                        {
+                            long double correct2 = func.f_f(0.0L);
+                            long double correct3 = func.f_f(-0.0L);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                }
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                }
+                if (fail)
+                {
+                    // Report this thread's own input and reference values
+                    vlog_error("\nERROR: %s%s: %f ulp error at %.13la "
+                               "(0x%16.16llx): *%.13la vs. %.13la\n",
+                               job->f->name, sizeNames[k], err, s[j],
+                               ((cl_ulong *)s)[j], r[j], test);
+                    return -1;
+                }
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+    return CL_SUCCESS;
+}

From 68ee30fb4bcfaf59af2039375321745f2d36c70f Mon Sep 17 00:00:00 2001
From: John Kesapides <46718829+JohnKesapidesARM@users.noreply.github.com>
Date: Tue, 9 Mar 2021 22:57:49 +0000
Subject: [PATCH 059/158] Fix possible size_t overflow in 32-bit builds.
 (#1131)

* Fix possible size_t overflow in 32-bit builds.

Use cl_ulong temporary values for row/slice_pitch.

Signed-off-by: John Kesapides <john.kesapides@arm.com>

* Remove redundant casts

Signed-off-by: John Kesapides <john.kesapides@arm.com>
---
 .../kernel_image_methods/test_2D_array.cpp    | 20 ++++++++++++-------
 .../images/kernel_image_methods/test_3D.cpp   | 20 ++++++++++++-------
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/test_conformance/images/kernel_image_methods/test_2D_array.cpp b/test_conformance/images/kernel_image_methods/test_2D_array.cpp
index 79248dd530..21a6b049d0 100644
--- a/test_conformance/images/kernel_image_methods/test_2D_array.cpp
+++ b/test_conformance/images/kernel_image_methods/test_2D_array.cpp
@@ -244,6 +244,9 @@ int test_get_image_info_2D_array(cl_device_id device, cl_context context,
         for( int i = 0; i < NUM_IMAGE_ITERATIONS; i++ )
         {
             cl_ulong size;
+            cl_ulong slicePitch;
+            cl_ulong rowPitch;
+
             // Loop until we get a size that a) will fit in the max alloc size and b) that an allocation of that
             // image, the result array, plus offset arrays, will fit in the global ram space
             do
@@ -252,23 +255,26 @@ int test_get_image_info_2D_array(cl_device_id device, cl_context context,
                 imageInfo.height = (size_t)random_log_in_range( 16, (int)maxHeight / 32, seed );
                 imageInfo.arraySize = (size_t)random_log_in_range( 16, (int)maxArraySize / 32, seed );
 
-                imageInfo.rowPitch = imageInfo.width * pixelSize;
-                imageInfo.slicePitch = imageInfo.rowPitch * imageInfo.height;
+                rowPitch = imageInfo.width * pixelSize;
+                slicePitch = rowPitch * imageInfo.height;
 
                 size_t extraWidth = (int)random_log_in_range( 0, 64, seed );
-                imageInfo.rowPitch += extraWidth;
+                rowPitch += extraWidth;
 
                 do {
                     extraWidth++;
-                    imageInfo.rowPitch += extraWidth;
-                } while ((imageInfo.rowPitch % pixelSize) != 0);
+                    rowPitch += extraWidth;
+                } while ((rowPitch % pixelSize) != 0);
 
                 size_t extraHeight = (int)random_log_in_range( 0, 8, seed );
-                imageInfo.slicePitch = imageInfo.rowPitch * (imageInfo.height + extraHeight);
+                slicePitch = rowPitch * (imageInfo.height + extraHeight);
 
-                size = (cl_ulong)imageInfo.slicePitch * (cl_ulong)imageInfo.arraySize * 4 * 4;
+                size = slicePitch * imageInfo.arraySize * 4 * 4;
             } while(  size > maxAllocSize || ( size * 3 ) > memSize );
 
+            imageInfo.slicePitch = slicePitch;
+            imageInfo.rowPitch = rowPitch;
+
             if( gDebugTrace )
                 log_info( "   at size %d,%d,%d (pitch %d,%d) out of %d,%d,%d\n", (int)imageInfo.width, (int)imageInfo.height, (int)imageInfo.arraySize, (int)imageInfo.rowPitch, (int)imageInfo.slicePitch, (int)maxWidth, (int)maxHeight, (int)maxArraySize );
             int ret = test_get_2Dimage_array_info_single(
diff --git a/test_conformance/images/kernel_image_methods/test_3D.cpp b/test_conformance/images/kernel_image_methods/test_3D.cpp
index 287005a511..aae433bd76 100644
--- a/test_conformance/images/kernel_image_methods/test_3D.cpp
+++ b/test_conformance/images/kernel_image_methods/test_3D.cpp
@@ -105,6 +105,9 @@ int test_get_image_info_3D(cl_device_id device, cl_context context,
         for( int i = 0; i < NUM_IMAGE_ITERATIONS; i++ )
         {
             cl_ulong size;
+            cl_ulong slicePitch;
+            cl_ulong rowPitch;
+
             // Loop until we get a size that a) will fit in the max alloc size and b) that an allocation of that
             // image, the result array, plus offset arrays, will fit in the global ram space
             do
@@ -113,23 +116,26 @@ int test_get_image_info_3D(cl_device_id device, cl_context context,
                 imageInfo.height = (size_t)random_log_in_range( 16, (int)maxHeight / 32, seed );
                 imageInfo.depth = (size_t)random_log_in_range( 16, (int)maxDepth / 32, seed );
 
-                imageInfo.rowPitch = imageInfo.width * pixelSize;
-                imageInfo.slicePitch = imageInfo.rowPitch * imageInfo.height;
+                rowPitch = imageInfo.width * pixelSize;
+                slicePitch = rowPitch * imageInfo.height;
 
                 size_t extraWidth = (int)random_log_in_range( 0, 64, seed );
-                imageInfo.rowPitch += extraWidth;
+                rowPitch += extraWidth;
 
                 do {
                     extraWidth++;
-                    imageInfo.rowPitch += extraWidth;
-                } while ((imageInfo.rowPitch % pixelSize) != 0);
+                    rowPitch += extraWidth;
+                } while ((rowPitch % pixelSize) != 0);
 
                 size_t extraHeight = (int)random_log_in_range( 0, 8, seed );
-                imageInfo.slicePitch = imageInfo.rowPitch * (imageInfo.height + extraHeight);
+                slicePitch = rowPitch * (imageInfo.height + extraHeight);
 
-                size = (cl_ulong)imageInfo.slicePitch * (cl_ulong)imageInfo.depth * 4 * 4;
+                size = slicePitch * imageInfo.depth * 4 * 4;
             } while(  size > maxAllocSize || ( size * 3 ) > memSize );
 
+            imageInfo.slicePitch = slicePitch;
+            imageInfo.rowPitch = rowPitch;
+
             if( gDebugTrace )
                 log_info( "   at size %d,%d,%d (pitch %d,%d) out of %d,%d,%d\n", (int)imageInfo.width, (int)imageInfo.height, (int)imageInfo.depth, (int)imageInfo.rowPitch, (int)imageInfo.slicePitch, (int)maxWidth, (int)maxHeight, (int)maxDepth );
             int ret = test_get_image_info_single(context, queue, &imageInfo,

From ee600e89d7c7fd2cb9aa103b59fc42294c43c852 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Thu, 11 Mar 2021 09:44:38 +0000
Subject: [PATCH 060/158] Reduce differences by using common names (#1187)

Improve format.

The binary_operator tests are left untouched by this commit as they
require some non-automatic changes.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/binary_double.cpp        | 42 +++++++--------
 .../math_brute_force/binary_float.cpp         | 33 ++++++------
 .../math_brute_force/binary_i_double.cpp      | 54 +++++++++----------
 .../math_brute_force/binary_i_float.cpp       | 36 ++++++-------
 .../binary_operator_double.cpp                | 43 +++++++--------
 .../binary_operator_float.cpp                 | 32 ++++++-----
 .../binary_two_results_i_double.cpp           | 13 +++--
 .../binary_two_results_i_float.cpp            |  5 +-
 .../math_brute_force/i_unary_double.cpp       | 13 +++--
 .../math_brute_force/i_unary_float.cpp        |  5 +-
 .../math_brute_force/macro_binary_double.cpp  | 42 +++++++--------
 .../math_brute_force/macro_binary_float.cpp   | 32 ++++++-----
 .../math_brute_force/macro_unary_double.cpp   | 21 ++++----
 .../math_brute_force/macro_unary_float.cpp    | 11 ++--
 .../math_brute_force/mad_double.cpp           | 13 +++--
 .../math_brute_force/mad_float.cpp            |  5 +-
 .../math_brute_force/ternary_double.cpp       | 31 ++++++-----
 .../math_brute_force/ternary_float.cpp        | 23 ++++----
 .../math_brute_force/unary_double.cpp         | 21 ++++----
 .../math_brute_force/unary_float.cpp          | 11 ++--
 .../unary_two_results_double.cpp              | 13 +++--
 .../unary_two_results_float.cpp               |  5 +-
 .../unary_two_results_i_double.cpp            | 13 +++--
 .../unary_two_results_i_float.cpp             |  5 +-
 .../math_brute_force/unary_u_double.cpp       | 13 +++--
 .../math_brute_force/unary_u_float.cpp        |  5 +-
 26 files changed, 247 insertions(+), 293 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
index cbb186ed04..16c306179f 100644
--- a/test_conformance/math_brute_force/binary_double.cpp
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -22,9 +22,8 @@
 
 const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
 
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode)
+static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                       cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -120,14 +119,12 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
-                             info->kernels[i], info->programs + i,
-                             info->relaxedMode);
+    return BuildKernel(info->nameInCode, i, info->kernel_count,
+                       info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -170,7 +167,7 @@ typedef struct TestInfo
 } TestInfo;
 
 // A table of more difficult cases to get right
-static const double specialValuesDouble[] = {
+static const double specialValues[] = {
     -NAN,
     -INFINITY,
     -DBL_MAX,
@@ -280,10 +277,10 @@ static const double specialValuesDouble[] = {
     +0.0,
 };
 
-static size_t specialValuesDoubleCount =
-    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
+static const size_t specialValuesCount =
+    sizeof(specialValues) / sizeof(specialValues[0]);
 
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
@@ -411,7 +408,7 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             goto exit;
@@ -420,7 +417,7 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
         for (i = 0; i < test_info.threadCount; i++)
@@ -562,7 +559,7 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     return error;
 }
 
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -608,8 +605,7 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
     cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount =
-        specialValuesDoubleCount * specialValuesDoubleCount;
+    int totalSpecialValueCount = specialValuesCount * specialValuesCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
     if (job_id <= (cl_uint)indx)
@@ -618,18 +614,18 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
         cl_double *fp2 = (cl_double *)p2;
         uint32_t x, y;
 
-        x = (job_id * buffer_elements) % specialValuesDoubleCount;
-        y = (job_id * buffer_elements) / specialValuesDoubleCount;
+        x = (job_id * buffer_elements) % specialValuesCount;
+        y = (job_id * buffer_elements) / specialValuesCount;
 
         for (; j < buffer_elements; j++)
         {
-            fp[j] = specialValuesDouble[x];
-            fp2[j] = specialValuesDouble[y];
-            if (++x >= specialValuesDoubleCount)
+            fp[j] = specialValues[x];
+            fp2[j] = specialValues[y];
+            if (++x >= specialValuesCount)
             {
                 x = 0;
                 y++;
-                if (y >= specialValuesDoubleCount) break;
+                if (y >= specialValuesCount) break;
             }
         }
     }
diff --git a/test_conformance/math_brute_force/binary_float.cpp b/test_conformance/math_brute_force/binary_float.cpp
index 8dfb9f40fc..fb56c642d0 100644
--- a/test_conformance/math_brute_force/binary_float.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -117,8 +117,7 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -166,7 +165,7 @@ typedef struct TestInfo
 } TestInfo;
 
 // A table of more difficult cases to get right
-static const float specialValuesFloat[] = {
+static const float specialValues[] = {
     -NAN,
     -INFINITY,
     -FLT_MAX,
@@ -268,11 +267,10 @@ static const float specialValuesFloat[] = {
     +0.0f,
 };
 
-static const size_t specialValuesFloatCount =
-    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
+static const size_t specialValuesCount =
+    sizeof(specialValues) / sizeof(specialValues[0]);
 
-
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
@@ -402,7 +400,7 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             goto exit;
@@ -411,7 +409,7 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
         for (i = 0; i < test_info.threadCount; i++)
@@ -552,7 +550,7 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     return error;
 }
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -613,8 +611,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
 
-    int totalSpecialValueCount =
-        specialValuesFloatCount * specialValuesFloatCount;
+    int totalSpecialValueCount = specialValuesCount * specialValuesCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
     if (job_id <= (cl_uint)indx)
@@ -623,19 +620,19 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         float *fp2 = (float *)p2;
         uint32_t x, y;
 
-        x = (job_id * buffer_elements) % specialValuesFloatCount;
-        y = (job_id * buffer_elements) / specialValuesFloatCount;
+        x = (job_id * buffer_elements) % specialValuesCount;
+        y = (job_id * buffer_elements) / specialValuesCount;
 
         for (; j < buffer_elements; j++)
         {
-            fp[j] = specialValuesFloat[x];
-            fp2[j] = specialValuesFloat[y];
+            fp[j] = specialValues[x];
+            fp2[j] = specialValues[y];
             ++x;
-            if (x >= specialValuesFloatCount)
+            if (x >= specialValuesCount)
             {
                 x = 0;
                 y++;
-                if (y >= specialValuesFloatCount) break;
+                if (y >= specialValuesCount) break;
             }
         }
     }
diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index eb2c59e029..74d0819c46 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -21,9 +21,8 @@
 #include <climits>
 #include <cstring>
 
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode)
+static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                       cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -119,14 +118,12 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
-                             info->kernels[i], info->programs + i,
-                             info->relaxedMode);
+    return BuildKernel(info->nameInCode, i, info->kernel_count,
+                       info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -165,7 +162,7 @@ typedef struct TestInfo
 } TestInfo;
 
 // A table of more difficult cases to get right
-static const double specialValuesDouble[] = {
+static const double specialValues[] = {
     -NAN,
     -INFINITY,
     -DBL_MAX,
@@ -275,17 +272,17 @@ static const double specialValuesDouble[] = {
     +0.0,
 };
 
-static size_t specialValuesDoubleCount =
-    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
+static const size_t specialValuesCount =
+    sizeof(specialValues) / sizeof(specialValues[0]);
 
-static const int specialValuesInt2[] = { 0,       1,     2,      3,
-                                         1022,    1023,  1024,   INT_MIN,
-                                         INT_MAX, -1,    -2,     -3,
-                                         -1022,   -1023, -11024, -INT_MAX };
-static constexpr size_t specialValuesInt2Count =
-    sizeof(specialValuesInt2) / sizeof(specialValuesInt2[0]);
+static const int specialValuesInt[] = {
+    0,       1,  2,  3,  1022,  1023,  1024,   INT_MIN,
+    INT_MAX, -1, -2, -3, -1022, -1023, -11024, -INT_MAX,
+};
+static constexpr size_t specialValuesIntCount =
+    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
 
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
 {
@@ -412,7 +409,7 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             goto exit;
@@ -421,7 +418,7 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
     // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
         for (i = 0; i < test_info.threadCount; i++)
@@ -564,7 +561,7 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
     return error;
 }
 
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -608,8 +605,7 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
     cl_int *p2 = (cl_int *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount =
-        specialValuesDoubleCount * specialValuesInt2Count;
+    int totalSpecialValueCount = specialValuesCount * specialValuesIntCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
     if (job_id <= (cl_uint)indx)
@@ -618,18 +614,18 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
         cl_int *ip2 = (cl_int *)p2;
         uint32_t x, y;
 
-        x = (job_id * buffer_elements) % specialValuesDoubleCount;
-        y = (job_id * buffer_elements) / specialValuesDoubleCount;
+        x = (job_id * buffer_elements) % specialValuesCount;
+        y = (job_id * buffer_elements) / specialValuesCount;
 
         for (; j < buffer_elements; j++)
         {
-            fp[j] = specialValuesDouble[x];
-            ip2[j] = specialValuesInt2[y];
-            if (++x >= specialValuesDoubleCount)
+            fp[j] = specialValues[x];
+            ip2[j] = specialValuesInt[y];
+            if (++x >= specialValuesCount)
             {
                 x = 0;
                 y++;
-                if (y >= specialValuesInt2Count) break;
+                if (y >= specialValuesIntCount) break;
             }
         }
     }
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
index 019c96a5bf..6b1289376a 100644
--- a/test_conformance/math_brute_force/binary_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -116,8 +116,7 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -161,7 +160,7 @@ typedef struct TestInfo
 } TestInfo;
 
 // A table of more difficult cases to get right
-static const float specialValuesFloat[] = {
+static const float specialValues[] = {
     -NAN,
     -INFINITY,
     -FLT_MAX,
@@ -263,19 +262,19 @@ static const float specialValuesFloat[] = {
     +0.0f,
 };
 
-static const size_t specialValuesFloatCount =
-    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
+static const size_t specialValuesCount =
+    sizeof(specialValues) / sizeof(specialValues[0]);
 
 static const int specialValuesInt[] = {
-    0,           1,           2,          3,          126,        127,
-    128,         0x02000001,  0x04000001, 1465264071, 1488522147, -1,
-    -2,          -3,          -126,       -127,       -128,       -0x02000001,
-    -0x04000001, -1465264071, -1488522147
+    0,           1,           2,           3,          126,        127,
+    128,         0x02000001,  0x04000001,  1465264071, 1488522147, -1,
+    -2,          -3,          -126,        -127,       -128,       -0x02000001,
+    -0x04000001, -1465264071, -1488522147,
 };
 static size_t specialValuesIntCount =
     sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
 {
@@ -403,7 +402,7 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             goto exit;
@@ -412,7 +411,7 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
     // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
         for (i = 0; i < test_info.threadCount; i++)
@@ -553,7 +552,7 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
     return error;
 }
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -596,8 +595,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
 
-    int totalSpecialValueCount =
-        specialValuesFloatCount * specialValuesIntCount;
+    int totalSpecialValueCount = specialValuesCount * specialValuesIntCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
     if (job_id <= (cl_uint)indx)
@@ -606,15 +604,15 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         cl_int *ip2 = (cl_int *)p2;
         uint32_t x, y;
 
-        x = (job_id * buffer_elements) % specialValuesFloatCount;
-        y = (job_id * buffer_elements) / specialValuesFloatCount;
+        x = (job_id * buffer_elements) % specialValuesCount;
+        y = (job_id * buffer_elements) / specialValuesCount;
 
         for (; j < buffer_elements; j++)
         {
-            fp[j] = specialValuesFloat[x];
+            fp[j] = specialValues[x];
             ip2[j] = specialValuesInt[y];
             ++x;
-            if (x >= specialValuesFloatCount)
+            if (x >= specialValuesCount)
             {
                 x = 0;
                 y++;
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index a7fb3cd0d3..a762e3e955 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -20,9 +20,9 @@
 
 #include <cstring>
 
-static int BuildKernelDouble(const char *name, const char *operator_symbol,
-                             int vectorSize, cl_uint kernel_count, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
+static int BuildKernel(const char *name, const char *operator_symbol,
+                       int vectorSize, cl_uint kernel_count, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void ",
@@ -123,14 +123,12 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->name, info->operator_symbol, i,
-                             info->kernel_count, info->kernels[i],
-                             info->programs + i, info->relaxedMode);
+    return BuildKernel(info->name, info->operator_symbol, i, info->kernel_count,
+                       info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -171,7 +169,7 @@ typedef struct TestInfo
 } TestInfo;
 
 // A table of more difficult cases to get right
-static const double specialValuesDouble[] = {
+static const double specialValues[] = {
     -NAN,
     -INFINITY,
     -DBL_MAX,
@@ -281,10 +279,10 @@ static const double specialValuesDouble[] = {
     +0.0,
 };
 
-static size_t specialValuesDoubleCount =
-    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
+static const size_t specialValuesCount =
+    sizeof(specialValues) / sizeof(specialValues[0]);
 
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
                                            bool relaxedMode)
@@ -412,7 +410,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
                                        f->name,
                                        f->nameInCode,
                                        relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             goto exit;
@@ -421,7 +419,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
     // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
         for (i = 0; i < test_info.threadCount; i++)
@@ -563,7 +561,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
     return error;
 }
 
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -608,8 +606,7 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
     cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount =
-        specialValuesDoubleCount * specialValuesDoubleCount;
+    int totalSpecialValueCount = specialValuesCount * specialValuesCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
     if (job_id <= (cl_uint)indx)
@@ -618,18 +615,18 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
         cl_double *fp2 = (cl_double *)p2;
         uint32_t x, y;
 
-        x = (job_id * buffer_elements) % specialValuesDoubleCount;
-        y = (job_id * buffer_elements) / specialValuesDoubleCount;
+        x = (job_id * buffer_elements) % specialValuesCount;
+        y = (job_id * buffer_elements) / specialValuesCount;
 
         for (; j < buffer_elements; j++)
         {
-            fp[j] = specialValuesDouble[x];
-            fp2[j] = specialValuesDouble[y];
-            if (++x >= specialValuesDoubleCount)
+            fp[j] = specialValues[x];
+            fp2[j] = specialValues[y];
+            if (++x >= specialValuesCount)
             {
                 x = 0;
                 y++;
-                if (y >= specialValuesDoubleCount) break;
+                if (y >= specialValuesCount) break;
             }
         }
     }
diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
index a9d3b7cf25..5ec6b5d38d 100644
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -121,8 +121,7 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -168,7 +167,7 @@ typedef struct TestInfo
 } TestInfo;
 
 // A table of more difficult cases to get right
-static const float specialValuesFloat[] = {
+static const float specialValues[] = {
     -NAN,
     -INFINITY,
     -FLT_MAX,
@@ -270,10 +269,10 @@ static const float specialValuesFloat[] = {
     +0.0f,
 };
 
-static const size_t specialValuesFloatCount =
-    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
+static const size_t specialValuesCount =
+    sizeof(specialValues) / sizeof(specialValues[0]);
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
                                         bool relaxedMode)
@@ -403,7 +402,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
                                        f->name,
                                        f->nameInCode,
                                        relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             goto exit;
@@ -412,7 +411,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
     // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
         for (i = 0; i < test_info.threadCount; i++)
@@ -553,7 +552,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
     return error;
 }
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -604,8 +603,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
 
-    int totalSpecialValueCount =
-        specialValuesFloatCount * specialValuesFloatCount;
+    int totalSpecialValueCount = specialValuesCount * specialValuesCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
     if (job_id <= (cl_uint)indx)
@@ -613,19 +611,19 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         // Insert special values
         uint32_t x, y;
 
-        x = (job_id * buffer_elements) % specialValuesFloatCount;
-        y = (job_id * buffer_elements) / specialValuesFloatCount;
+        x = (job_id * buffer_elements) % specialValuesCount;
+        y = (job_id * buffer_elements) / specialValuesCount;
 
         for (; j < buffer_elements; j++)
         {
-            p[j] = ((cl_uint *)specialValuesFloat)[x];
-            p2[j] = ((cl_uint *)specialValuesFloat)[y];
+            p[j] = ((cl_uint *)specialValues)[x];
+            p2[j] = ((cl_uint *)specialValues)[y];
             ++x;
-            if (x >= specialValuesFloatCount)
+            if (x >= specialValuesCount)
             {
                 x = 0;
                 y++;
-                if (y >= specialValuesFloatCount) break;
+                if (y >= specialValuesCount) break;
             }
             if (relaxedMode && strcmp(name, "divide") == 0)
             {
diff --git a/test_conformance/math_brute_force/binary_two_results_i_double.cpp b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
index 7f22af2a1c..71e11fc638 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
@@ -21,8 +21,8 @@
 #include <climits>
 #include <cstring>
 
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -124,13 +124,12 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
+    return BuildKernel(info->nameInCode, i, info->kernels + i,
+                       info->programs + i, info->relaxedMode);
 }
 
 typedef struct ComputeReferenceInfoD_
@@ -194,7 +193,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             return error;
diff --git a/test_conformance/math_brute_force/binary_two_results_i_float.cpp b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
index 1daf2b99ea..9db5c1c1c2 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
@@ -122,8 +122,7 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -195,7 +194,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             return error;
diff --git a/test_conformance/math_brute_force/i_unary_double.cpp b/test_conformance/math_brute_force/i_unary_double.cpp
index f52fc13666..51243ffa00 100644
--- a/test_conformance/math_brute_force/i_unary_double.cpp
+++ b/test_conformance/math_brute_force/i_unary_double.cpp
@@ -20,8 +20,8 @@
 
 #include <cstring>
 
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -109,13 +109,12 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
+    return BuildKernel(info->nameInCode, i, info->kernels + i,
+                       info->programs + i, info->relaxedMode);
 }
 
 int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
@@ -143,7 +142,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             return error;
diff --git a/test_conformance/math_brute_force/i_unary_float.cpp b/test_conformance/math_brute_force/i_unary_float.cpp
index 6b4a0f8c25..d1fb867df4 100644
--- a/test_conformance/math_brute_force/i_unary_float.cpp
+++ b/test_conformance/math_brute_force/i_unary_float.cpp
@@ -107,8 +107,7 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -141,7 +140,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             return error;
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index 2ea785e62e..02c042b9ee 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -20,9 +20,8 @@
 
 #include <cstring>
 
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode)
+static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                       cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -118,14 +117,12 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
-                             info->kernels[i], info->programs + i,
-                             info->relaxedMode);
+    return BuildKernel(info->nameInCode, i, info->kernel_count,
+                       info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -157,7 +154,7 @@ typedef struct TestInfo
 } TestInfo;
 
 // A table of more difficult cases to get right
-static const double specialValuesDouble[] = {
+static const double specialValues[] = {
     -NAN,
     -INFINITY,
     -DBL_MAX,
@@ -267,10 +264,10 @@ static const double specialValuesDouble[] = {
     +0.0,
 };
 
-static size_t specialValuesDoubleCount =
-    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
+static const size_t specialValuesCount =
+    sizeof(specialValues) / sizeof(specialValues[0]);
 
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
@@ -390,7 +387,7 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             goto exit;
@@ -399,7 +396,7 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         if (error) goto exit;
 
@@ -530,7 +527,7 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     return error;
 }
 
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -573,26 +570,25 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     double *p = (double *)gIn + thread_id * buffer_elements;
     double *p2 = (double *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount =
-        specialValuesDoubleCount * specialValuesDoubleCount;
+    int totalSpecialValueCount = specialValuesCount * specialValuesCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
     if (job_id <= (cl_uint)indx)
     { // test edge cases
         uint32_t x, y;
 
-        x = (job_id * buffer_elements) % specialValuesDoubleCount;
-        y = (job_id * buffer_elements) / specialValuesDoubleCount;
+        x = (job_id * buffer_elements) % specialValuesCount;
+        y = (job_id * buffer_elements) / specialValuesCount;
 
         for (; j < buffer_elements; j++)
         {
-            p[j] = specialValuesDouble[x];
-            p2[j] = specialValuesDouble[y];
-            if (++x >= specialValuesDoubleCount)
+            p[j] = specialValues[x];
+            p2[j] = specialValues[y];
+            if (++x >= specialValuesCount)
             {
                 x = 0;
                 y++;
-                if (y >= specialValuesDoubleCount) break;
+                if (y >= specialValuesCount) break;
             }
         }
     }
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
index a61ab6b392..8f2e41fcb4 100644
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -115,8 +115,7 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -153,7 +152,7 @@ typedef struct TestInfo
 } TestInfo;
 
 // A table of more difficult cases to get right
-static const float specialValuesFloat[] = {
+static const float specialValues[] = {
     -NAN,
     -INFINITY,
     -FLT_MAX,
@@ -255,10 +254,10 @@ static const float specialValuesFloat[] = {
     +0.0f,
 };
 
-static const size_t specialValuesFloatCount =
-    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
+static const size_t specialValuesCount =
+    sizeof(specialValues) / sizeof(specialValues[0]);
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
@@ -379,7 +378,7 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             goto exit;
@@ -388,7 +387,7 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         if (error) goto exit;
 
@@ -516,7 +515,7 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     return error;
 }
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -558,8 +557,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
 
-    int totalSpecialValueCount =
-        specialValuesFloatCount * specialValuesFloatCount;
+    int totalSpecialValueCount = specialValuesCount * specialValuesCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
     if (job_id <= (cl_uint)indx)
@@ -568,19 +566,19 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         float *fp2 = (float *)p2;
         uint32_t x, y;
 
-        x = (job_id * buffer_elements) % specialValuesFloatCount;
-        y = (job_id * buffer_elements) / specialValuesFloatCount;
+        x = (job_id * buffer_elements) % specialValuesCount;
+        y = (job_id * buffer_elements) / specialValuesCount;
 
         for (; j < buffer_elements; j++)
         {
-            fp[j] = specialValuesFloat[x];
-            fp2[j] = specialValuesFloat[y];
+            fp[j] = specialValues[x];
+            fp2[j] = specialValues[y];
             ++x;
-            if (x >= specialValuesFloatCount)
+            if (x >= specialValuesCount)
             {
                 x = 0;
                 y++;
-                if (y >= specialValuesFloatCount) break;
+                if (y >= specialValuesCount) break;
             }
         }
     }
diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp
index 6f1e27184f..2df22dcce9 100644
--- a/test_conformance/math_brute_force/macro_unary_double.cpp
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -20,9 +20,8 @@
 
 #include <cstring>
 
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode)
+static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                       cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -112,14 +111,12 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
-                             info->kernels[i], info->programs + i,
-                             info->relaxedMode);
+    return BuildKernel(info->nameInCode, i, info->kernel_count,
+                       info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -148,7 +145,7 @@ typedef struct TestInfo
 
 } TestInfo;
 
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data);
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 {
@@ -256,7 +253,7 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             goto exit;
@@ -265,7 +262,7 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         if (error) goto exit;
 
@@ -375,7 +372,7 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     return error;
 }
 
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
diff --git a/test_conformance/math_brute_force/macro_unary_float.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp
index 86000a36a4..c44c17750c 100644
--- a/test_conformance/math_brute_force/macro_unary_float.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -110,8 +110,7 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -145,7 +144,7 @@ typedef struct TestInfo
 
 } TestInfo;
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
 {
@@ -254,7 +253,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             goto exit;
@@ -263,7 +262,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         if (error) goto exit;
 
@@ -372,7 +371,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     return error;
 }
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
diff --git a/test_conformance/math_brute_force/mad_double.cpp b/test_conformance/math_brute_force/mad_double.cpp
index 1393b1defd..8502ceb257 100644
--- a/test_conformance/math_brute_force/mad_double.cpp
+++ b/test_conformance/math_brute_force/mad_double.cpp
@@ -20,8 +20,8 @@
 
 #include <cstring>
 
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -122,13 +122,12 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
+    return BuildKernel(info->nameInCode, i, info->kernels + i,
+                       info->programs + i, info->relaxedMode);
 }
 
 int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
@@ -151,7 +150,7 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             return error;
diff --git a/test_conformance/math_brute_force/mad_float.cpp b/test_conformance/math_brute_force/mad_float.cpp
index eaab29fb6c..453e43e79b 100644
--- a/test_conformance/math_brute_force/mad_float.cpp
+++ b/test_conformance/math_brute_force/mad_float.cpp
@@ -120,8 +120,7 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -150,7 +149,7 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             return error;
diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp
index dc4bd8c62a..21c00f5131 100644
--- a/test_conformance/math_brute_force/ternary_double.cpp
+++ b/test_conformance/math_brute_force/ternary_double.cpp
@@ -23,8 +23,8 @@
 #define CORRECTLY_ROUNDED 0
 #define FLUSHED 1
 
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -125,17 +125,16 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
+    return BuildKernel(info->nameInCode, i, info->kernels + i,
+                       info->programs + i, info->relaxedMode);
 }
 
 // A table of more difficult cases to get right
-static const double specialValuesDouble[] = {
+static const double specialValues[] = {
     -NAN,
     -INFINITY,
     -DBL_MAX,
@@ -203,8 +202,8 @@ static const double specialValuesDouble[] = {
     +0.0,
 };
 
-static const size_t specialValuesDoubleCount =
-    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
+static const size_t specialValuesCount =
+    sizeof(specialValues) / sizeof(specialValues[0]);
 
 int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
                                          bool relaxedMode)
@@ -230,7 +229,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             return error;
@@ -249,16 +248,16 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
             x = y = z = 0;
             for (; j < bufferSize / sizeof(double); j++)
             {
-                p[j] = specialValuesDouble[x];
-                p2[j] = specialValuesDouble[y];
-                p3[j] = specialValuesDouble[z];
-                if (++x >= specialValuesDoubleCount)
+                p[j] = specialValues[x];
+                p2[j] = specialValues[y];
+                p3[j] = specialValues[z];
+                if (++x >= specialValuesCount)
                 {
                     x = 0;
-                    if (++y >= specialValuesDoubleCount)
+                    if (++y >= specialValuesCount)
                     {
                         y = 0;
-                        if (++z >= specialValuesDoubleCount) break;
+                        if (++z >= specialValuesCount) break;
                     }
                 }
             }
diff --git a/test_conformance/math_brute_force/ternary_float.cpp b/test_conformance/math_brute_force/ternary_float.cpp
index 1fcdc4a2c4..20b3eb7127 100644
--- a/test_conformance/math_brute_force/ternary_float.cpp
+++ b/test_conformance/math_brute_force/ternary_float.cpp
@@ -123,8 +123,7 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -133,7 +132,7 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
 }
 
 // A table of more difficult cases to get right
-static const float specialValuesFloat[] = {
+static const float specialValues[] = {
     -NAN,
     -INFINITY,
     -FLT_MAX,
@@ -211,8 +210,8 @@ static const float specialValuesFloat[] = {
     +0.0f,
 };
 
-static const size_t specialValuesFloatCount =
-    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
+static const size_t specialValuesCount =
+    sizeof(specialValues) / sizeof(specialValues[0]);
 
 int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
@@ -246,7 +245,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             return error;
@@ -268,17 +267,17 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             x = y = z = 0;
             for (; j < bufferSize / sizeof(float); j++)
             {
-                fp[j] = specialValuesFloat[x];
-                fp2[j] = specialValuesFloat[y];
-                fp3[j] = specialValuesFloat[z];
+                fp[j] = specialValues[x];
+                fp2[j] = specialValues[y];
+                fp3[j] = specialValues[z];
 
-                if (++x >= specialValuesFloatCount)
+                if (++x >= specialValuesCount)
                 {
                     x = 0;
-                    if (++y >= specialValuesFloatCount)
+                    if (++y >= specialValuesCount)
                     {
                         y = 0;
-                        if (++z >= specialValuesFloatCount) break;
+                        if (++z >= specialValuesCount) break;
                     }
                 }
             }
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
index 99959ae30f..9ba457c735 100644
--- a/test_conformance/math_brute_force/unary_double.cpp
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -24,9 +24,8 @@
 #include <sys/time.h>
 #endif
 
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode)
+static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                       cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -116,14 +115,12 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
-                             info->kernels[i], info->programs + i,
-                             info->relaxedMode);
+    return BuildKernel(info->nameInCode, i, info->kernel_count,
+                       info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -160,7 +157,7 @@ typedef struct TestInfo
                       // otherwise.
 } TestInfo;
 
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data);
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
@@ -276,7 +273,7 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             goto exit;
@@ -285,7 +282,7 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
         for (i = 0; i < test_info.threadCount; i++)
@@ -424,7 +421,7 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     return error;
 }
 
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
diff --git a/test_conformance/math_brute_force/unary_float.cpp b/test_conformance/math_brute_force/unary_float.cpp
index 52ba0eb521..ee250f34e1 100644
--- a/test_conformance/math_brute_force/unary_float.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -113,8 +113,7 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -156,7 +155,7 @@ typedef struct TestInfo
                       // otherwise.
 } TestInfo;
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
@@ -287,7 +286,7 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             goto exit;
@@ -296,7 +295,7 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     // Run the kernels
     if (!gSkipCorrectnessTesting || skipTestingRelaxed)
     {
-        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
         for (i = 0; i < test_info.threadCount; i++)
@@ -430,7 +429,7 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     return error;
 }
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
+static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
diff --git a/test_conformance/math_brute_force/unary_two_results_double.cpp b/test_conformance/math_brute_force/unary_two_results_double.cpp
index e2231a5830..d42b7ce600 100644
--- a/test_conformance/math_brute_force/unary_two_results_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_double.cpp
@@ -20,8 +20,8 @@
 
 #include <cstring>
 
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -116,13 +116,12 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
+    return BuildKernel(info->nameInCode, i, info->kernels + i,
+                       info->programs + i, info->relaxedMode);
 }
 
 int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
@@ -149,7 +148,7 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             return error;
diff --git a/test_conformance/math_brute_force/unary_two_results_float.cpp b/test_conformance/math_brute_force/unary_two_results_float.cpp
index d85d60f0d0..70b24c7a94 100644
--- a/test_conformance/math_brute_force/unary_two_results_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_float.cpp
@@ -114,8 +114,7 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -151,7 +150,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             return error;
diff --git a/test_conformance/math_brute_force/unary_two_results_i_double.cpp b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
index 4bf017ce56..44eab122f4 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
@@ -21,8 +21,8 @@
 #include <climits>
 #include <cstring>
 
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -117,13 +117,12 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
+    return BuildKernel(info->nameInCode, i, info->kernels + i,
+                       info->programs + i, info->relaxedMode);
 }
 
 static cl_ulong abs_cl_long(cl_long i)
@@ -157,7 +156,7 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             return error;
diff --git a/test_conformance/math_brute_force/unary_two_results_i_float.cpp b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
index 9bf297edce..1312be01bc 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
@@ -115,8 +115,7 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -161,7 +160,7 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             return error;
diff --git a/test_conformance/math_brute_force/unary_u_double.cpp b/test_conformance/math_brute_force/unary_u_double.cpp
index e0641b6ae6..80acafb348 100644
--- a/test_conformance/math_brute_force/unary_u_double.cpp
+++ b/test_conformance/math_brute_force/unary_u_double.cpp
@@ -20,8 +20,8 @@
 
 #include <cstring>
 
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -111,13 +111,12 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
+    return BuildKernel(info->nameInCode, i, info->kernels + i,
+                       info->programs + i, info->relaxedMode);
 }
 
 static cl_ulong random64(MTdata d)
@@ -146,7 +145,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             return error;
diff --git a/test_conformance/math_brute_force/unary_u_float.cpp b/test_conformance/math_brute_force/unary_u_float.cpp
index 2ff575265e..4b453c871c 100644
--- a/test_conformance/math_brute_force/unary_u_float.cpp
+++ b/test_conformance/math_brute_force/unary_u_float.cpp
@@ -108,8 +108,7 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
+static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -145,7 +144,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+        if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
             return error;

From da2a1bd4a523e4faeabcd662b96036b075c4a6b6 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Wed, 17 Mar 2021 09:50:11 +0000
Subject: [PATCH 061/158] Remove trivially dead code (#1190)

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/binary_double.cpp              | 1 -
 test_conformance/math_brute_force/binary_i_double.cpp            | 1 -
 test_conformance/math_brute_force/binary_operator_double.cpp     | 1 -
 .../math_brute_force/binary_two_results_i_double.cpp             | 1 -
 test_conformance/math_brute_force/i_unary_double.cpp             | 1 -
 test_conformance/math_brute_force/macro_binary_double.cpp        | 1 -
 test_conformance/math_brute_force/macro_unary_double.cpp         | 1 -
 test_conformance/math_brute_force/mad_double.cpp                 | 1 -
 test_conformance/math_brute_force/ternary_double.cpp             | 1 -
 test_conformance/math_brute_force/unary_double.cpp               | 1 -
 test_conformance/math_brute_force/unary_two_results_double.cpp   | 1 -
 test_conformance/math_brute_force/unary_two_results_i_double.cpp | 1 -
 test_conformance/math_brute_force/unary_u_double.cpp             | 1 -
 13 files changed, 13 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
index 16c306179f..315c6c3393 100644
--- a/test_conformance/math_brute_force/binary_double.cpp
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -521,7 +521,6 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
     if (!gSkipCorrectnessTesting)
diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index 74d0819c46..229c8f9590 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -523,7 +523,6 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
             vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
     if (!gSkipCorrectnessTesting)
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index a762e3e955..046f1e783b 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -523,7 +523,6 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
             vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
     if (!gSkipCorrectnessTesting)
diff --git a/test_conformance/math_brute_force/binary_two_results_i_double.cpp b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
index 71e11fc638..f1a27ee36e 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
@@ -651,7 +651,6 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
     if (!gSkipCorrectnessTesting)
diff --git a/test_conformance/math_brute_force/i_unary_double.cpp b/test_conformance/math_brute_force/i_unary_double.cpp
index 51243ffa00..c3822f366b 100644
--- a/test_conformance/math_brute_force/i_unary_double.cpp
+++ b/test_conformance/math_brute_force/i_unary_double.cpp
@@ -354,7 +354,6 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
     vlog("\n");
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index 02c042b9ee..589dc9bbea 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -491,7 +491,6 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
     vlog("\n");
diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp
index 2df22dcce9..06cae25bff 100644
--- a/test_conformance/math_brute_force/macro_unary_double.cpp
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -338,7 +338,6 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
     vlog("\n");
diff --git a/test_conformance/math_brute_force/mad_double.cpp b/test_conformance/math_brute_force/mad_double.cpp
index 8502ceb257..5e7dba9879 100644
--- a/test_conformance/math_brute_force/mad_double.cpp
+++ b/test_conformance/math_brute_force/mad_double.cpp
@@ -385,7 +385,6 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
     if (!gSkipCorrectnessTesting)
diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp
index 21c00f5131..96a0e7bbb8 100644
--- a/test_conformance/math_brute_force/ternary_double.cpp
+++ b/test_conformance/math_brute_force/ternary_double.cpp
@@ -821,7 +821,6 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
             vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
     if (!gSkipCorrectnessTesting)
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
index 9ba457c735..b8f85fb855 100644
--- a/test_conformance/math_brute_force/unary_double.cpp
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -382,7 +382,6 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
     if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
diff --git a/test_conformance/math_brute_force/unary_two_results_double.cpp b/test_conformance/math_brute_force/unary_two_results_double.cpp
index d42b7ce600..77e66ca529 100644
--- a/test_conformance/math_brute_force/unary_two_results_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_double.cpp
@@ -502,7 +502,6 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
     if (!gSkipCorrectnessTesting)
diff --git a/test_conformance/math_brute_force/unary_two_results_i_double.cpp b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
index 44eab122f4..150e5d8f7b 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
@@ -475,7 +475,6 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
     if (!gSkipCorrectnessTesting)
diff --git a/test_conformance/math_brute_force/unary_u_double.cpp b/test_conformance/math_brute_force/unary_u_double.cpp
index 80acafb348..d104fc88cf 100644
--- a/test_conformance/math_brute_force/unary_u_double.cpp
+++ b/test_conformance/math_brute_force/unary_u_double.cpp
@@ -366,7 +366,6 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
             vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
     if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);

From 111bb2b185bb912816016348d4403a6b9921924a Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Wed, 17 Mar 2021 09:50:30 +0000
Subject: [PATCH 062/158] Fix discrepancy in logging messages (#1189)

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/binary_double.cpp         | 6 +++---
 test_conformance/math_brute_force/binary_i_double.cpp       | 6 +++---
 test_conformance/math_brute_force/binary_i_float.cpp        | 4 ++--
 .../math_brute_force/binary_operator_double.cpp             | 6 +++---
 test_conformance/math_brute_force/binary_operator_float.cpp | 4 ++--
 test_conformance/math_brute_force/macro_binary_double.cpp   | 6 +++---
 test_conformance/math_brute_force/macro_binary_float.cpp    | 4 ++--
 test_conformance/math_brute_force/macro_unary_double.cpp    | 6 +++---
 test_conformance/math_brute_force/macro_unary_float.cpp     | 4 ++--
 test_conformance/math_brute_force/unary_double.cpp          | 6 +++---
 test_conformance/math_brute_force/unary_float.cpp           | 4 ++--
 11 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
index 315c6c3393..15a48a6664 100644
--- a/test_conformance/math_brute_force/binary_double.cpp
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -385,9 +385,9 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
                 goto exit;
             }
         }
diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index 229c8f9590..686e32b405 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -386,9 +386,9 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
                 goto exit;
             }
         }
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
index 6b1289376a..d99382f1be 100644
--- a/test_conformance/math_brute_force/binary_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -380,8 +380,8 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
                 vlog_error("Error: Unable to create sub-buffer of "
-                           "gInBuffer for region {%zd, %zd}\n",
-                           region.origin, region.size);
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
                 goto exit;
             }
         }
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index 046f1e783b..cdb4663629 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -384,9 +384,9 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
                 goto exit;
             }
         }
diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
index 5ec6b5d38d..eba3ee6650 100644
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -377,8 +377,8 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
                 vlog_error("Error: Unable to create sub-buffer of "
-                           "gInBuffer for region {%zd, %zd}\n",
-                           region.origin, region.size);
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
                 goto exit;
             }
         }
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index 589dc9bbea..cbcecb3345 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -364,9 +364,9 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
                 goto exit;
             }
         }
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
index 8f2e41fcb4..bbccbbe9b5 100644
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -356,8 +356,8 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
                 vlog_error("Error: Unable to create sub-buffer of "
-                           "gInBuffer for region {%zd, %zd}\n",
-                           region.origin, region.size);
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
                 goto exit;
             }
         }
diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp
index 06cae25bff..154d8ecb94 100644
--- a/test_conformance/math_brute_force/macro_unary_double.cpp
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -232,9 +232,9 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
                 goto exit;
             }
         }
diff --git a/test_conformance/math_brute_force/macro_unary_float.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp
index c44c17750c..725a83168a 100644
--- a/test_conformance/math_brute_force/macro_unary_float.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -233,8 +233,8 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
                 vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer for region {%zd, %zd}\n",
-                           region.origin, region.size);
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
                 goto exit;
             }
         }
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
index b8f85fb855..1074e9cafd 100644
--- a/test_conformance/math_brute_force/unary_double.cpp
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -252,9 +252,9 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
                 goto exit;
             }
         }
diff --git a/test_conformance/math_brute_force/unary_float.cpp b/test_conformance/math_brute_force/unary_float.cpp
index ee250f34e1..126db73eaa 100644
--- a/test_conformance/math_brute_force/unary_float.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -248,8 +248,8 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
                 vlog_error("Error: Unable to create sub-buffer of "
-                           "gInBuffer for region {%zd, %zd}\n",
-                           region.origin, region.size);
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
                 goto exit;
             }
         }

From 6b36f645b835f8054422cad3f050d18935be11be Mon Sep 17 00:00:00 2001
From: Zakaria Taha <45341452+zakaria6868@users.noreply.github.com>
Date: Thu, 18 Mar 2021 16:27:59 +0200
Subject: [PATCH 063/158] Add tests to proposed new builtin async_copy
 functions with a bug fix. (#725)

* Add tests to proposed new builtin async_copy functions with a bug fix.

* Revert "Add tests to proposed new builtin async_copy functions with a bug fix."

This reverts commit 7d0f16d014d228c327daf27464b27e02267f9aef.

* Add tests to proposed new builtin async_copy functions.

* Added is_extension_available to check if an extension is available.

* Added is_extension_available check for test_async_copy_fence.

* fix build issues on windows.

* include <algorithm> for async copy 2D/3D.

* adding algorithms header.

* Fix numLines - 1 in maxTotalPlanesIn/Out.

* fix formatting violations.

* fixed formatting issue.
---
 test_conformance/basic/CMakeLists.txt         |   3 +
 test_conformance/basic/main.cpp               |  14 +-
 test_conformance/basic/procs.h                |  36 +
 test_conformance/basic/test_async_copy2D.cpp  | 449 ++++++++++
 test_conformance/basic/test_async_copy3D.cpp  | 546 ++++++++++++
 .../basic/test_async_copy_fence.cpp           | 812 ++++++++++++++++++
 6 files changed, 1858 insertions(+), 2 deletions(-)
 create mode 100644 test_conformance/basic/test_async_copy2D.cpp
 create mode 100644 test_conformance/basic/test_async_copy3D.cpp
 create mode 100644 test_conformance/basic/test_async_copy_fence.cpp

diff --git a/test_conformance/basic/CMakeLists.txt b/test_conformance/basic/CMakeLists.txt
index 27178246bb..c5c4b5f0cc 100644
--- a/test_conformance/basic/CMakeLists.txt
+++ b/test_conformance/basic/CMakeLists.txt
@@ -37,6 +37,9 @@ set(${MODULE_NAME}_SOURCES
     test_work_item_functions.cpp
     test_astype.cpp
     test_async_copy.cpp
+    test_async_copy2D.cpp
+    test_async_copy3D.cpp
+    test_async_copy_fence.cpp	
     test_sizeof.cpp
     test_vector_creation.cpp
     test_vector_swizzle.cpp
diff --git a/test_conformance/basic/main.cpp b/test_conformance/basic/main.cpp
index d1a35faec7..86c3cec359 100644
--- a/test_conformance/basic/main.cpp
+++ b/test_conformance/basic/main.cpp
@@ -113,14 +113,24 @@ test_definition test_list[] = {
     ADD_TEST(async_copy_local_to_global),
     ADD_TEST(async_strided_copy_global_to_local),
     ADD_TEST(async_strided_copy_local_to_global),
+    ADD_TEST(async_copy_global_to_local2D),
+    ADD_TEST(async_copy_local_to_global2D),
+    ADD_TEST(async_copy_global_to_local3D),
+    ADD_TEST(async_copy_local_to_global3D),
+    ADD_TEST(async_work_group_copy_fence_import_after_export_aliased_local),
+    ADD_TEST(async_work_group_copy_fence_import_after_export_aliased_global),
+    ADD_TEST(
+        async_work_group_copy_fence_import_after_export_aliased_global_and_local),
+    ADD_TEST(async_work_group_copy_fence_export_after_import_aliased_local),
+    ADD_TEST(async_work_group_copy_fence_export_after_import_aliased_global),
+    ADD_TEST(
+        async_work_group_copy_fence_export_after_import_aliased_global_and_local),
     ADD_TEST(prefetch),
-
     ADD_TEST(kernel_call_kernel_function),
     ADD_TEST(host_numeric_constants),
     ADD_TEST(kernel_numeric_constants),
     ADD_TEST(kernel_limit_constants),
     ADD_TEST(kernel_preprocessor_macros),
-
     ADD_TEST(parameter_types),
     ADD_TEST(vector_creation),
     ADD_TEST(vector_swizzle),
diff --git a/test_conformance/basic/procs.h b/test_conformance/basic/procs.h
index bdb7d6a430..4a01a8cbb0 100644
--- a/test_conformance/basic/procs.h
+++ b/test_conformance/basic/procs.h
@@ -115,6 +115,42 @@ extern int      test_async_copy_global_to_local(cl_device_id deviceID, cl_contex
 extern int      test_async_copy_local_to_global(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_async_strided_copy_global_to_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_async_strided_copy_local_to_global(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_async_copy_global_to_local2D(cl_device_id deviceID,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements);
+extern int test_async_copy_local_to_global2D(cl_device_id deviceID,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements);
+extern int test_async_copy_global_to_local3D(cl_device_id deviceID,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements);
+extern int test_async_copy_local_to_global3D(cl_device_id deviceID,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements);
+extern int test_async_work_group_copy_fence_import_after_export_aliased_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_async_work_group_copy_fence_import_after_export_aliased_global(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int
+test_async_work_group_copy_fence_import_after_export_aliased_global_and_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_async_work_group_copy_fence_export_after_import_aliased_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_async_work_group_copy_fence_export_after_import_aliased_global(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int
+test_async_work_group_copy_fence_export_after_import_aliased_global_and_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
 extern int      test_prefetch(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 
 extern int      test_host_numeric_constants(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
diff --git a/test_conformance/basic/test_async_copy2D.cpp b/test_conformance/basic/test_async_copy2D.cpp
new file mode 100644
index 0000000000..2b534497fa
--- /dev/null
+++ b/test_conformance/basic/test_async_copy2D.cpp
@@ -0,0 +1,449 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <algorithm>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "../../test_common/harness/conversions.h"
+#include "procs.h"
+
+static const char *async_global_to_local_kernel2D =
+    "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
+    "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int "
+    "lineCopiesPerWorkItem, int srcStride, int dstStride )\n"
+    "{\n"
+    " int i, j;\n"
+    // Zero the local storage first
+    " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numElementsPerLine; j++)\n"
+    "     localBuffer[ (get_local_id( 0 "
+    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ] = "
+    "(%s)(%s)0;\n"
+    // Do this to verify all kernels are done zeroing the local buffer before we
+    // try the copy
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t event;\n"
+    "    event = async_work_group_copy_2D2D( (__local %s*)localBuffer, "
+    "(__global const "
+    "%s*)(src+lineCopiesPerWorkgroup*get_group_id(0)*(numElementsPerLine + "
+    "srcStride)), (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, "
+    "srcStride, dstStride, 0 );\n"
+    // Wait for the copy to complete, then verify by manually copying to the
+    // dest
+    "     wait_group_events( 1, &event );\n"
+    " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numElementsPerLine; j++)\n"
+    "     dst[ (get_global_id( 0 "
+    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ] = "
+    "localBuffer[ (get_local_id( 0 "
+    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ];\n"
+    "}\n";
+
+static const char *async_local_to_global_kernel2D =
+    "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
+    "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int "
+    "lineCopiesPerWorkItem, int srcStride, int dstStride )\n"
+    "{\n"
+    " int i, j;\n"
+    // Zero the local storage first
+    " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numElementsPerLine; j++)\n"
+    "     localBuffer[ (get_local_id( 0 "
+    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + srcStride)+j ] = "
+    "(%s)(%s)0;\n"
+    // Do this to verify all kernels are done zeroing the local buffer before we
+    // try the copy
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numElementsPerLine; j++)\n"
+    "     localBuffer[ (get_local_id( 0 "
+    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + srcStride)+j ] = src[ "
+    "(get_global_id( 0 )*lineCopiesPerWorkItem+i)*(numElementsPerLine + "
+    "srcStride)+j ];\n"
+    // Do this to verify all kernels are done copying to the local buffer before
+    // we try the copy
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t event;\n"
+    "    event = async_work_group_copy_2D2D((__global "
+    "%s*)(dst+lineCopiesPerWorkgroup*get_group_id(0)*(numElementsPerLine + "
+    "dstStride)), (__local const %s*)localBuffer, (size_t)numElementsPerLine, "
+    "(size_t)lineCopiesPerWorkgroup, srcStride, dstStride, 0 );\n"
+    "    wait_group_events( 1, &event );\n"
+    "}\n";
+
+int test_copy2D(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, const char *kernelCode,
+                ExplicitType vecType, int vecSize, int srcStride, int dstStride,
+                bool localIsDst)
+{
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper streams[2];
+    size_t threads[1], localThreads[1];
+    void *inBuffer, *outBuffer, *outBufferCopy;
+    MTdata d;
+    char vecNameString[64];
+    vecNameString[0] = 0;
+    if (vecSize == 1)
+        sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
+    else
+        sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType),
+                vecSize);
+
+    size_t elementSize = get_explicit_type_size(vecType) * vecSize;
+    log_info("Testing %s with srcStride = %d, dstStride = %d\n", vecNameString,
+             srcStride, dstStride);
+
+    if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
+    {
+        log_info(
+            "Device does not support extended async copies. Skipping test.\n");
+        return 0;
+    }
+
+    cl_long max_local_mem_size;
+    error =
+        clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
+                        sizeof(max_local_mem_size), &max_local_mem_size, NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
+
+    cl_long max_global_mem_size;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
+                            sizeof(max_global_mem_size), &max_global_mem_size,
+                            NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_GLOBAL_MEM_SIZE failed.");
+
+    cl_long max_alloc_size;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(max_alloc_size), &max_alloc_size, NULL);
+    test_error(error,
+               "clGetDeviceInfo for CL_DEVICE_MAX_MEM_ALLOC_SIZE failed.");
+
+    if (max_alloc_size > max_global_mem_size / 2)
+        max_alloc_size = max_global_mem_size / 2;
+
+    unsigned int num_of_compute_devices;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
+                            sizeof(num_of_compute_devices),
+                            &num_of_compute_devices, NULL);
+    test_error(error,
+               "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
+
+    char programSource[4096];
+    programSource[0] = 0;
+    char *programPtr;
+
+    sprintf(programSource, kernelCode,
+            vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
+                               : "",
+            vecNameString, vecNameString, vecNameString, vecNameString,
+            get_explicit_type_name(vecType), vecNameString, vecNameString);
+    // log_info("program: %s\n", programSource);
+    programPtr = programSource;
+
+    error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                        (const char **)&programPtr, "test_fn");
+    test_error(error, "Unable to create testing kernel");
+
+    size_t max_workgroup_size;
+    error = clGetKernelWorkGroupInfo(
+        kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size),
+        &max_workgroup_size, NULL);
+    test_error(
+        error,
+        "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
+
+    size_t max_local_workgroup_size[3];
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                            sizeof(max_local_workgroup_size),
+                            max_local_workgroup_size, NULL);
+    test_error(error,
+               "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
+
+    // Pick the minimum of the device and the kernel
+    if (max_workgroup_size > max_local_workgroup_size[0])
+        max_workgroup_size = max_local_workgroup_size[0];
+
+    size_t numElementsPerLine = 10;
+    size_t lineCopiesPerWorkItem = 13;
+    elementSize =
+        get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
+    size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem * elementSize
+        * (numElementsPerLine + (localIsDst ? dstStride : srcStride));
+    size_t maxLocalWorkgroupSize =
+        (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
+
+    // Calculation can return 0 on embedded devices due to 1KB local mem limit
+    if (maxLocalWorkgroupSize == 0)
+    {
+        maxLocalWorkgroupSize = 1;
+    }
+
+    size_t localWorkgroupSize = maxLocalWorkgroupSize;
+    if (maxLocalWorkgroupSize > max_workgroup_size)
+        localWorkgroupSize = max_workgroup_size;
+
+    size_t maxTotalLinesIn = (max_alloc_size / elementSize + srcStride)
+        / (numElementsPerLine + srcStride);
+    size_t maxTotalLinesOut = (max_alloc_size / elementSize + dstStride)
+        / (numElementsPerLine + dstStride);
+    size_t maxTotalLines = (std::min)(maxTotalLinesIn, maxTotalLinesOut);
+    size_t maxLocalWorkgroups =
+        maxTotalLines / (localWorkgroupSize * lineCopiesPerWorkItem);
+
+    size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem
+        - (localIsDst ? dstStride : srcStride);
+    size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups);
+    size_t totalLines =
+        numberOfLocalWorkgroups * localWorkgroupSize * lineCopiesPerWorkItem;
+    size_t inBufferSize = elementSize
+        * (totalLines * numElementsPerLine + (totalLines - 1) * srcStride);
+    size_t outBufferSize = elementSize
+        * (totalLines * numElementsPerLine + (totalLines - 1) * dstStride);
+    size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize;
+
+    inBuffer = (void *)malloc(inBufferSize);
+    outBuffer = (void *)malloc(outBufferSize);
+    outBufferCopy = (void *)malloc(outBufferSize);
+
+    cl_int lineCopiesPerWorkItemInt, numElementsPerLineInt,
+        lineCopiesPerWorkgroup;
+    lineCopiesPerWorkItemInt = (int)lineCopiesPerWorkItem;
+    numElementsPerLineInt = (int)numElementsPerLine;
+    lineCopiesPerWorkgroup = (int)(lineCopiesPerWorkItem * localWorkgroupSize);
+
+    log_info(
+        "Global: %d, local %d, local buffer %db, global in buffer %db, "
+        "global out buffer %db, each work group will copy %d lines and each "
+        "work item will copy %d lines.\n",
+        (int)globalWorkgroupSize, (int)localWorkgroupSize, (int)localBufferSize,
+        (int)inBufferSize, (int)outBufferSize, lineCopiesPerWorkgroup,
+        lineCopiesPerWorkItemInt);
+
+    threads[0] = globalWorkgroupSize;
+    localThreads[0] = localWorkgroupSize;
+
+    d = init_genrand(gRandomSeed);
+    generate_random_data(
+        vecType, inBufferSize / get_explicit_type_size(vecType), d, inBuffer);
+    generate_random_data(
+        vecType, outBufferSize / get_explicit_type_size(vecType), d, outBuffer);
+    free_mtdata(d);
+    d = NULL;
+    memcpy(outBufferCopy, outBuffer, outBufferSize);
+
+    streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize,
+                                inBuffer, &error);
+    test_error(error, "Unable to create input buffer");
+    streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, outBufferSize,
+                                outBuffer, &error);
+    test_error(error, "Unable to create output buffer");
+
+    error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 2, localBufferSize, NULL);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 3, sizeof(numElementsPerLineInt),
+                           &numElementsPerLineInt);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 4, sizeof(lineCopiesPerWorkgroup),
+                           &lineCopiesPerWorkgroup);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 5, sizeof(lineCopiesPerWorkItemInt),
+                           &lineCopiesPerWorkItemInt);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 6, sizeof(srcStride), &srcStride);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 7, sizeof(dstStride), &dstStride);
+    test_error(error, "Unable to set kernel argument");
+
+    // Enqueue
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, NULL);
+    test_error(error, "Unable to queue kernel");
+
+    // Read
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, outBufferSize,
+                                outBuffer, 0, NULL, NULL);
+    test_error(error, "Unable to read results");
+
+    // Verify
+    int failuresPrinted = 0;
+    // Compare copied elements; stride gaps must match the pre-run copy
+    size_t typeSize = get_explicit_type_size(vecType) * vecSize;
+    for (int i = 0;
+         i < (int)globalWorkgroupSize * lineCopiesPerWorkItem * elementSize;
+         i += elementSize)
+    {
+        for (int j = 0; j < (int)numElementsPerLine * elementSize;
+             j += elementSize)
+        {
+            int inIdx = i * (numElementsPerLine + srcStride) + j;
+            int outIdx = i * (numElementsPerLine + dstStride) + j;
+            if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx,
+                       typeSize)
+                != 0)
+            {
+                unsigned char *inchar = (unsigned char *)inBuffer + inIdx;
+                unsigned char *outchar = (unsigned char *)outBuffer + outIdx;
+                char values[4096];
+                values[0] = 0;
+
+                if (failuresPrinted == 0)
+                {
+                    // Print first failure message
+                    log_error("ERROR: Results of copy did not validate!\n");
+                }
+                sprintf(values + strlen(values), "%d -> [", inIdx);
+                for (int k = 0; k < (int)elementSize; k++)
+                    sprintf(values + strlen(values), "%2x ", inchar[k]);
+                sprintf(values + strlen(values), "] != [");
+                for (int k = 0; k < (int)elementSize; k++)
+                    sprintf(values + strlen(values), "%2x ", outchar[k]);
+                sprintf(values + strlen(values), "]");
+                log_error("%s\n", values);
+                failuresPrinted++;
+            }
+
+            if (failuresPrinted > 5)
+            {
+                log_error("Not printing further failures...\n");
+                return -1;
+            }
+        }
+        if (i < (int)(globalWorkgroupSize * lineCopiesPerWorkItem - 1)
+                * elementSize)
+        {
+            int outIdx = i * (numElementsPerLine + dstStride)
+                + numElementsPerLine * elementSize;
+            if (memcmp(((char *)outBuffer) + outIdx,
+                       ((char *)outBufferCopy) + outIdx,
+                       dstStride * elementSize)
+                != 0)
+            {
+                if (failuresPrinted == 0)
+                {
+                    // Print first failure message
+                    log_error("ERROR: Results of copy did not validate!\n");
+                }
+                log_error(
+                    "2D copy corrupted data in output buffer in the stride "
+                    "offset of line %d\n",
+                    i);
+                failuresPrinted++;
+            }
+            if (failuresPrinted > 5)
+            {
+                log_error("Not printing further failures...\n");
+                return -1;
+            }
+        }
+    }
+
+    free(inBuffer);
+    free(outBuffer);
+    free(outBufferCopy);
+
+    return failuresPrinted ? -1 : 0;
+}
+
+int test_copy2D_all_types(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, const char *kernelCode,
+                          bool localIsDst)
+{
+    ExplicitType vecType[] = {
+        kChar,  kUChar, kShort,  kUShort,          kInt, kUInt, kLong,
+        kULong, kFloat, kDouble, kNumExplicitTypes
+    };
+    unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
+    unsigned int smallTypesStrideSizes[] = { 0, 10, 100 };
+    unsigned int size, typeIndex, srcStride, dstStride;
+
+    int errors = 0;
+
+    for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
+    {
+        if (vecType[typeIndex] == kDouble
+            && !is_extension_available(deviceID, "cl_khr_fp64"))
+            continue;
+
+        if ((vecType[typeIndex] == kLong || vecType[typeIndex] == kULong)
+            && !gHasLong)
+            continue;
+
+        for (size = 0; vecSizes[size] != 0; size++)
+        {
+            if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size]
+                <= 2) // small type
+            {
+                for (srcStride = 0; srcStride < sizeof(smallTypesStrideSizes)
+                         / sizeof(smallTypesStrideSizes[0]);
+                     srcStride++)
+                {
+                    for (dstStride = 0;
+                         dstStride < sizeof(smallTypesStrideSizes)
+                             / sizeof(smallTypesStrideSizes[0]);
+                         dstStride++)
+                    {
+                        if (test_copy2D(deviceID, context, queue, kernelCode,
+                                        vecType[typeIndex], vecSizes[size],
+                                        smallTypesStrideSizes[srcStride],
+                                        smallTypesStrideSizes[dstStride],
+                                        localIsDst))
+                        {
+                            errors++;
+                        }
+                    }
+                }
+            }
+            // not a small type, check only zero stride
+            else if (test_copy2D(deviceID, context, queue, kernelCode,
+                                 vecType[typeIndex], vecSizes[size], 0, 0,
+                                 localIsDst))
+            {
+                errors++;
+            }
+        }
+    }
+    if (errors) return -1;
+    return 0;
+}
+
+int test_async_copy_global_to_local2D(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue, int num_elements)
+{
+    return test_copy2D_all_types(deviceID, context, queue,
+                                 async_global_to_local_kernel2D, true);
+}
+
+int test_async_copy_local_to_global2D(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue, int num_elements)
+{
+    return test_copy2D_all_types(deviceID, context, queue,
+                                 async_local_to_global_kernel2D, false);
+}
diff --git a/test_conformance/basic/test_async_copy3D.cpp b/test_conformance/basic/test_async_copy3D.cpp
new file mode 100644
index 0000000000..af10191fee
--- /dev/null
+++ b/test_conformance/basic/test_async_copy3D.cpp
@@ -0,0 +1,546 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <algorithm>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "../../test_common/harness/conversions.h"
+#include "procs.h"
+
+static const char *async_global_to_local_kernel3D = // sprintf format string
+    "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
+    "%s\n" // optional pragma string (cl_khr_fp64 enable when testing double)
+    "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
+    "%s *localBuffer, int numElementsPerLine, int numLines, int "
+    "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, "
+    "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n"
+    "{\n"
+    " int i, j, k;\n"
+    // Each work-item first zeroes its own planes of the local buffer
+    " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numLines; j++)\n"
+    "     for(k=0; k<numElementsPerLine; k++)\n"
+    "       localBuffer[ (get_local_id( 0 "
+    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
+    "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
+    "dstLineStride) + k ] = (%s)(%s)0;\n"
+    // Barrier: every work-item in the group must have finished zeroing the
+    // local buffer before the async copy into it is issued
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t event;\n"
+    "    event = async_work_group_copy_3D3D( (__local %s*)localBuffer, "
+    "(__global const "
+    "%s*)(src+planesCopiesPerWorkgroup*get_group_id(0)*(numLines*"
+    "numElementsPerLine + numLines*srcLineStride + srcPlaneStride)), "
+    "(size_t)numElementsPerLine, (size_t)numLines, srcLineStride, "
+    "dstLineStride, planesCopiesPerWorkgroup, srcPlaneStride, dstPlaneStride, "
+    "0 );\n"
+    // Wait for the async copy, then have each work-item copy its planes from
+    // the local buffer to dst so the result can be read back and checked
+    " wait_group_events( 1, &event );\n"
+    " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numLines; j++)\n"
+    "     for(k=0; k<numElementsPerLine; k++)\n"
+    "       dst[ (get_global_id( 0 "
+    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
+    "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
+    "dstLineStride) + k ] = localBuffer[ (get_local_id( 0 "
+    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
+    "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
+    "dstLineStride) + k ];\n"
+    "}\n";
+
+static const char *async_local_to_global_kernel3D = // sprintf format string
+    "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
+    "%s\n" // optional pragma string (cl_khr_fp64 enable when testing double)
+    "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
+    "%s *localBuffer, int numElementsPerLine, int numLines, int "
+    "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, "
+    "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n"
+    "{\n"
+    " int i, j, k;\n"
+    // Each work-item first zeroes its own planes of the local buffer
+    " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numLines; j++)\n"
+    "     for(k=0; k<numElementsPerLine; k++)\n"
+    "       localBuffer[ (get_local_id( 0 "
+    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
+    "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
+    "srcLineStride) + k ] = (%s)(%s)0;\n"
+    // Barrier: every work-item in the group must have finished zeroing the
+    // local buffer before it is filled from src by the loops below
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numLines; j++)\n"
+    "     for(k=0; k<numElementsPerLine; k++)\n"
+    "       localBuffer[ (get_local_id( 0 "
+    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
+    "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
+    "srcLineStride) + k ] = src[ (get_global_id( 0 "
+    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
+    "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
+    "srcLineStride) + k ];\n"
+    // Barrier: every work-item must have finished filling the local buffer
+    // before the async copy from local to global memory is issued
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t event;\n"
+    "    event = async_work_group_copy_3D3D((__global "
+    "%s*)(dst+planesCopiesPerWorkgroup*get_group_id(0)*(numLines*"
+    "numElementsPerLine + numLines*dstLineStride + dstPlaneStride)), (__local "
+    "const %s*)localBuffer, (size_t)numElementsPerLine, (size_t)numLines, "
+    "srcLineStride, dstLineStride, planesCopiesPerWorkgroup, srcPlaneStride, "
+    "dstPlaneStride, 0 );\n"
+    "    wait_group_events( 1, &event );\n"
+    "}\n";
+
+// Builds kernelCode for one element type and stride combination, runs it, and
+// verifies on the host that every copied element matches the input while the
+// line/plane stride gaps of the output keep their original contents.
+// Returns 0 on success or when the extension is missing, -1 on a mismatch.
+int test_copy3D(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, const char *kernelCode,
+                ExplicitType vecType, int vecSize, int srcLineStride,
+                int dstLineStride, int srcPlaneStride, int dstPlaneStride,
+                bool localIsDst)
+{
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper streams[2];
+    size_t threads[1], localThreads[1];
+    void *inBuffer, *outBuffer, *outBufferCopy;
+    MTdata d;
+    char vecNameString[64];
+    if (vecSize == 1)
+        sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
+    else
+        sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType),
+                vecSize);
+
+    size_t elementSize = get_explicit_type_size(vecType) * vecSize;
+    log_info("Testing %s with srcLineStride = %d, dstLineStride = %d, "
+             "srcPlaneStride = %d, dstPlaneStride = %d\n",
+             vecNameString, srcLineStride, dstLineStride, srcPlaneStride,
+             dstPlaneStride);
+
+    if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
+    {
+        log_info(
+            "Device does not support extended async copies. Skipping test.\n");
+        return 0;
+    }
+
+    cl_long max_local_mem_size;
+    error =
+        clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
+                        sizeof(max_local_mem_size), &max_local_mem_size, NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
+
+    cl_long max_global_mem_size;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
+                            sizeof(max_global_mem_size), &max_global_mem_size,
+                            NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_GLOBAL_MEM_SIZE failed.");
+
+    cl_long max_alloc_size;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(max_alloc_size), &max_alloc_size, NULL);
+    test_error(error,
+               "clGetDeviceInfo for CL_DEVICE_MAX_MEM_ALLOC_SIZE failed.");
+
+    if (max_alloc_size > max_global_mem_size / 2)
+        max_alloc_size = max_global_mem_size / 2;
+
+    unsigned int num_of_compute_devices;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
+                            sizeof(num_of_compute_devices),
+                            &num_of_compute_devices, NULL);
+    test_error(error,
+               "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
+
+    char programSource[4096];
+    char *programPtr;
+
+    sprintf(programSource, kernelCode,
+            vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
+                               : "",
+            vecNameString, vecNameString, vecNameString, vecNameString,
+            get_explicit_type_name(vecType), vecNameString, vecNameString);
+    programPtr = programSource;
+
+    error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                        (const char **)&programPtr, "test_fn");
+    test_error(error, "Unable to create testing kernel");
+
+    size_t max_workgroup_size;
+    error = clGetKernelWorkGroupInfo(
+        kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size),
+        &max_workgroup_size, NULL);
+    test_error(
+        error,
+        "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
+
+    size_t max_local_workgroup_size[3];
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                            sizeof(max_local_workgroup_size),
+                            max_local_workgroup_size, NULL);
+    test_error(error,
+               "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
+
+    // Pick the minimum of the device and the kernel
+    if (max_workgroup_size > max_local_workgroup_size[0])
+        max_workgroup_size = max_local_workgroup_size[0];
+
+    size_t numElementsPerLine = 10;
+    size_t numLines = 13;
+    size_t planesCopiesPerWorkItem = 2;
+    elementSize =
+        get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
+    size_t localStorageSpacePerWorkitem = elementSize
+        * (planesCopiesPerWorkItem
+           * (numLines * numElementsPerLine
+              + numLines * (localIsDst ? dstLineStride : srcLineStride)
+              + (localIsDst ? dstPlaneStride : srcPlaneStride)));
+    size_t maxLocalWorkgroupSize =
+        (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
+
+    // Calculation can return 0 on embedded devices due to 1KB local mem limit
+    if (maxLocalWorkgroupSize == 0)
+    {
+        maxLocalWorkgroupSize = 1;
+    }
+
+    size_t localWorkgroupSize = maxLocalWorkgroupSize;
+    if (maxLocalWorkgroupSize > max_workgroup_size)
+        localWorkgroupSize = max_workgroup_size;
+
+    size_t maxTotalPlanesIn = ((max_alloc_size / elementSize) + srcPlaneStride)
+        / ((numLines * numElementsPerLine + numLines * srcLineStride)
+           + srcPlaneStride);
+    size_t maxTotalPlanesOut = ((max_alloc_size / elementSize) + dstPlaneStride)
+        / ((numLines * numElementsPerLine + numLines * dstLineStride)
+           + dstPlaneStride);
+    size_t maxTotalPlanes = (std::min)(maxTotalPlanesIn, maxTotalPlanesOut);
+    size_t maxLocalWorkgroups =
+        maxTotalPlanes / (localWorkgroupSize * planesCopiesPerWorkItem);
+
+    size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem
+        - (localIsDst ? dstPlaneStride : srcPlaneStride);
+    size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups);
+    size_t totalPlanes =
+        numberOfLocalWorkgroups * localWorkgroupSize * planesCopiesPerWorkItem;
+    size_t inBufferSize = elementSize
+        * (totalPlanes
+               * (numLines * numElementsPerLine + numLines * srcLineStride)
+           + (totalPlanes - 1) * srcPlaneStride);
+    size_t outBufferSize = elementSize
+        * (totalPlanes
+               * (numLines * numElementsPerLine + numLines * dstLineStride)
+           + (totalPlanes - 1) * dstPlaneStride);
+    size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize;
+
+    inBuffer = (void *)malloc(inBufferSize);
+    outBuffer = (void *)malloc(outBufferSize);
+    outBufferCopy = (void *)malloc(outBufferSize);
+
+    cl_int planesCopiesPerWorkItemInt, numElementsPerLineInt, numLinesInt,
+        planesCopiesPerWorkgroup;
+    planesCopiesPerWorkItemInt = (int)planesCopiesPerWorkItem;
+    numElementsPerLineInt = (int)numElementsPerLine;
+    numLinesInt = (int)numLines;
+    planesCopiesPerWorkgroup =
+        (int)(planesCopiesPerWorkItem * localWorkgroupSize);
+
+    log_info("Global: %d, local %d, local buffer %db, global in buffer %db, "
+             "global out buffer %db, each work group will copy %d planes and "
+             "each work item item will copy %d planes.\n",
+             (int)globalWorkgroupSize, (int)localWorkgroupSize,
+             (int)localBufferSize, (int)inBufferSize, (int)outBufferSize,
+             planesCopiesPerWorkgroup, planesCopiesPerWorkItemInt);
+
+    threads[0] = globalWorkgroupSize;
+    localThreads[0] = localWorkgroupSize;
+
+    d = init_genrand(gRandomSeed);
+    generate_random_data(
+        vecType, inBufferSize / get_explicit_type_size(vecType), d, inBuffer);
+    generate_random_data(
+        vecType, outBufferSize / get_explicit_type_size(vecType), d, outBuffer);
+    free_mtdata(d);
+    memcpy(outBufferCopy, outBuffer, outBufferSize);
+
+    streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize,
+                                inBuffer, &error);
+    test_error(error, "Unable to create input buffer");
+    streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, outBufferSize,
+                                outBuffer, &error);
+    test_error(error, "Unable to create output buffer");
+
+    error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 2, localBufferSize, NULL);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 3, sizeof(numElementsPerLineInt),
+                           &numElementsPerLineInt);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 4, sizeof(numLinesInt), &numLinesInt);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 5, sizeof(planesCopiesPerWorkgroup),
+                           &planesCopiesPerWorkgroup);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 6, sizeof(planesCopiesPerWorkItemInt),
+                           &planesCopiesPerWorkItemInt);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 7, sizeof(srcLineStride), &srcLineStride);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 8, sizeof(dstLineStride), &dstLineStride);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 9, sizeof(srcPlaneStride), &srcPlaneStride);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 10, sizeof(dstPlaneStride), &dstPlaneStride);
+    test_error(error, "Unable to set kernel argument");
+
+    // Enqueue
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, NULL);
+    test_error(error, "Unable to queue kernel");
+
+    // Read
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, outBufferSize,
+                                outBuffer, 0, NULL, NULL);
+    test_error(error, "Unable to read results");
+
+    // Verify; i, j and k advance in bytes (multiples of elementSize)
+    int failuresPrinted = 0;
+    size_t typeSize = get_explicit_type_size(vecType) * vecSize;
+    for (int i = 0;
+         i < (int)globalWorkgroupSize * planesCopiesPerWorkItem * elementSize;
+         i += elementSize)
+    {
+        for (int j = 0; j < (int)numLines * elementSize; j += elementSize)
+        {
+            for (int k = 0; k < (int)numElementsPerLine * elementSize;
+                 k += elementSize)
+            {
+                int inIdx = i
+                        * (numLines * numElementsPerLine
+                           + numLines * srcLineStride + srcPlaneStride)
+                    + j * (numElementsPerLine + srcLineStride) + k;
+                int outIdx = i
+                        * (numLines * numElementsPerLine
+                           + numLines * dstLineStride + dstPlaneStride)
+                    + j * (numElementsPerLine + dstLineStride) + k;
+                if (memcmp(((char *)inBuffer) + inIdx,
+                           ((char *)outBuffer) + outIdx, typeSize)
+                    != 0)
+                {
+                    unsigned char *inchar = (unsigned char *)inBuffer + inIdx;
+                    unsigned char *outchar =
+                        (unsigned char *)outBuffer + outIdx;
+                    char values[4096];
+                    values[0] = 0;
+
+                    if (failuresPrinted == 0)
+                    {
+                        // Print first failure message
+                        log_error("ERROR: Results of copy did not validate!\n");
+                    }
+                    sprintf(values + strlen(values), "%d -> [", inIdx);
+                    for (int l = 0; l < (int)elementSize; l++)
+                        sprintf(values + strlen(values), "%2x ", inchar[l]);
+                    sprintf(values + strlen(values), "] != [");
+                    for (int l = 0; l < (int)elementSize; l++)
+                        sprintf(values + strlen(values), "%2x ", outchar[l]);
+                    sprintf(values + strlen(values), "]");
+                    log_error("%s\n", values);
+                    failuresPrinted++;
+                }
+
+                if (failuresPrinted > 5)
+                {
+                    log_error("Not printing further failures...\n");
+                    goto cleanup;
+                }
+            }
+            if (j < (int)numLines * elementSize)
+            {
+                int outIdx = i
+                        * (numLines * numElementsPerLine
+                           + numLines * dstLineStride + dstPlaneStride)
+                    + j * (numElementsPerLine + dstLineStride)
+                    + numElementsPerLine * elementSize;
+                if (memcmp(((char *)outBuffer) + outIdx,
+                           ((char *)outBufferCopy) + outIdx,
+                           dstLineStride * elementSize)
+                    != 0)
+                {
+                    if (failuresPrinted == 0)
+                    {
+                        // Print first failure message
+                        log_error("ERROR: Results of copy did not validate!\n");
+                    }
+                    log_error(
+                        "3D copy corrupted data in output buffer in the line "
+                        "stride offset of plane %d line %d\n",
+                        i, j);
+                    failuresPrinted++;
+                }
+                if (failuresPrinted > 5)
+                {
+                    log_error("Not printing further failures...\n");
+                    goto cleanup;
+                }
+            }
+        }
+        if (i < (int)(globalWorkgroupSize * planesCopiesPerWorkItem - 1)
+                * elementSize)
+        {
+            int outIdx = i
+                    * (numLines * numElementsPerLine + numLines * dstLineStride
+                       + dstPlaneStride)
+                + (numLines * elementSize) * (numElementsPerLine)
+                + (numLines * elementSize) * (dstLineStride);
+            if (memcmp(((char *)outBuffer) + outIdx,
+                       ((char *)outBufferCopy) + outIdx,
+                       dstPlaneStride * elementSize)
+                != 0)
+            {
+                if (failuresPrinted == 0)
+                {
+                    // Print first failure message
+                    log_error("ERROR: Results of copy did not validate!\n");
+                }
+                log_error("3D copy corrupted data in output buffer in the "
+                          "plane stride "
+                          "offset of plane %d\n",
+                          i);
+                failuresPrinted++;
+            }
+            if (failuresPrinted > 5)
+            {
+                log_error("Not printing further failures...\n");
+                goto cleanup;
+            }
+        }
+    }
+
+cleanup: // also reached after giving up on reporting further mismatches
+    free(inBuffer);
+    free(outBuffer);
+    free(outBufferCopy);
+
+    return failuresPrinted ? -1 : 0;
+}
+
+// Drives test_copy3D over every scalar type and vector width listed in the
+// tables below. Element types of at most 2 bytes are run with every
+// combination of the strideChoices values for the source/destination line and
+// plane strides; larger element types are run once with all strides zero.
+// kDouble is skipped without cl_khr_fp64 and kLong/kULong are skipped when
+// gHasLong is false. Returns -1 if any test_copy3D run reported failure,
+// otherwise 0.
+int test_copy3D_all_types(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, const char *kernelCode,
+                          bool localIsDst)
+{
+    // Both tables end with a sentinel entry that stops the loops below.
+    ExplicitType types[] = { kChar,  kUChar,  kShort, kUShort,
+                             kInt,   kUInt,   kLong,  kULong,
+                             kFloat, kDouble, kNumExplicitTypes };
+    unsigned int widths[] = { 1, 2, 3, 4, 8, 16, 0 };
+    // Stride values tried for each of the four stride arguments of small types
+    unsigned int strideChoices[] = { 0, 10, 100 };
+    const size_t numChoices = sizeof(strideChoices) / sizeof(strideChoices[0]);
+
+    int failedRuns = 0;
+
+    for (unsigned int t = 0; types[t] != kNumExplicitTypes; t++)
+    {
+        const ExplicitType type = types[t];
+
+        // skip double when the device does not report cl_khr_fp64
+        if (type == kDouble && !is_extension_available(deviceID, "cl_khr_fp64"))
+            continue;
+
+        // skip long and ulong when gHasLong is false
+        if ((type == kLong || type == kULong) && !gHasLong)
+            continue;
+
+        for (unsigned int w = 0; widths[w] != 0; w++)
+        {
+            const unsigned int width = widths[w];
+
+            if (get_explicit_type_size(type) * width > 2)
+            {
+                // not a small type, check only zero stride
+                if (test_copy3D(deviceID, context, queue, kernelCode, type,
+                                width, 0, 0, 0, 0, localIsDst))
+                {
+                    failedRuns++;
+                }
+                continue;
+            }
+
+            // small type: sweep every stride combination, with the
+            // destination plane stride varying fastest
+            for (size_t sl = 0; sl < numChoices; sl++)
+            {
+                for (size_t dl = 0; dl < numChoices; dl++)
+                {
+                    for (size_t sp = 0; sp < numChoices; sp++)
+                    {
+                        for (size_t dp = 0; dp < numChoices; dp++)
+                        {
+                            if (test_copy3D(deviceID, context, queue,
+                                            kernelCode, type, width,
+                                            strideChoices[sl],
+                                            strideChoices[dl],
+                                            strideChoices[sp],
+                                            strideChoices[dp],
+                                            localIsDst))
+                            {
+                                failedRuns++;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return failedRuns ? -1 : 0;
+}
+
+int test_async_copy_global_to_local3D(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue, int num_elements)
+{ // num_elements is not used; true is passed as localIsDst
+    const char *kernelSource = async_global_to_local_kernel3D;
+    return test_copy3D_all_types(deviceID, context, queue, kernelSource, true);
+}
+
+int test_async_copy_local_to_global3D(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue, int num_elements)
+{ // num_elements is not used; false is passed as localIsDst
+    const char *kernelSource = async_local_to_global_kernel3D;
+    return test_copy3D_all_types(deviceID, context, queue, kernelSource, false);
+}
diff --git a/test_conformance/basic/test_async_copy_fence.cpp b/test_conformance/basic/test_async_copy_fence.cpp
new file mode 100644
index 0000000000..74f6e40715
--- /dev/null
+++ b/test_conformance/basic/test_async_copy_fence.cpp
@@ -0,0 +1,812 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "../../test_common/harness/conversions.h"
+#include "procs.h"
+
+static const char *import_after_export_aliased_local_kernel =
+    "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *exportSrc, __global %s "
+    "*exportDst,\n"
+    "                       const __global %s *importSrc, __global %s "
+    "*importDst,\n"
+    "                       __local %s *localBuffer, /* there isn't another "
+    "__local %s local buffer since export src and import dst are aliased*/\n"
+    "                       int exportSrcLocalSize, int "
+    "exportCopiesPerWorkItem,\n"
+    "                       int importSrcLocalSize, int "
+    "importCopiesPerWorkItem )\n"
+    "{\n"
+    "    int i;\n"
+    "    int localImportOffset = exportSrcLocalSize - importSrcLocalSize;\n"
+    // Each work-item first zeroes its own slice of the local buffer
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        localBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] = "
+    "(%s)(%s)0;\n"
+    "    }\n"
+    "    // no need to set another local buffer values to (%s)(%s)0 since "
+    "export src and import dst are aliased (use the same buffer)\n"
+    // Barrier: every work-item must have finished zeroing the local buffer
+    // before it is filled with the data to export
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        localBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] = "
+    "exportSrc[ get_global_id( 0 )*exportCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    // Barrier: every work-item must have finished filling the local buffer
+    // before the export and the import are issued
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t events;\n"
+    "    events = async_work_group_copy((__global "
+    "%s*)(exportDst+exportSrcLocalSize*get_group_id(0)), (__local const "
+    "%s*)localBuffer, (size_t)exportSrcLocalSize, 0 );\n"
+    "    async_work_group_copy_fence( CLK_LOCAL_MEM_FENCE );\n"
+    "    events = async_work_group_copy( (__local "
+    "%s*)(localBuffer+localImportOffset), (__global const "
+    "%s*)(importSrc+importSrcLocalSize*get_group_id(0)), "
+    "(size_t)importSrcLocalSize, events );\n"
+    // Both copies share the single event_t above, so wait on exactly one
+    // event; then verify by manually copying the imported data to the dest
+    "    wait_group_events( 1, &events );\n"
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "(localBuffer+localImportOffset)[ get_local_id( 0 "
+    ")*importCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    "}\n";
+
+static const char *import_after_export_aliased_global_kernel =
+    "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *exportSrc, __global %s "
+    "*exportDstImportSrc,\n"
+    "                       __global %s *importDst, /* there isn't a dedicated "
+    "__global %s buffer for import src since export dst and import src are "
+    "aliased*/\n"
+    "                       __local %s *exportLocalBuffer, __local %s "
+    "*importLocalBuffer,\n"
+    "                       int exportSrcLocalSize, int "
+    "exportCopiesPerWorkItem,\n"
+    "                       int importSrcLocalSize, int "
+    "importCopiesPerWorkItem )\n"
+    "{\n"
+    "    int i;\n"
+    // Each work-item first zeroes its own slice of both local buffers
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        exportLocalBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] "
+    "= (%s)(%s)0;\n"
+    "    }\n"
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importLocalBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ] "
+    "= (%s)(%s)0;\n"
+    "    }\n"
+    // Barrier: every work-item must have finished zeroing the local buffers
+    // before the export buffer is filled with the data to export
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        exportLocalBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] "
+    "= exportSrc[ get_global_id( 0 )*exportCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    // Barrier: every work-item must have finished filling the export buffer
+    // before the export and the import are issued
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t events;\n"
+    "    events = async_work_group_copy((__global "
+    "%s*)(exportDstImportSrc+exportSrcLocalSize*get_group_id(0)), (__local "
+    "const %s*)exportLocalBuffer, (size_t)exportSrcLocalSize, 0 );\n"
+    "    async_work_group_copy_fence( CLK_GLOBAL_MEM_FENCE );\n"
+    "    events = async_work_group_copy( (__local %s*)importLocalBuffer, "
+    "(__global const "
+    "%s*)(exportDstImportSrc+exportSrcLocalSize*get_group_id(0) + "
+    "(exportSrcLocalSize - importSrcLocalSize)), (size_t)importSrcLocalSize, "
+    "events );\n"
+    // Both copies share the single event_t above, so wait on exactly one
+    // event; then verify by manually copying the imported data to the dest
+    "    wait_group_events( 1, &events );\n"
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "importLocalBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    "}\n";
+
+static const char *import_after_export_aliased_global_and_local_kernel =
+    "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *exportSrc, __global %s "
+    "*exportDstImportSrc,\n"
+    "                       __global %s *importDst, /* there isn't a dedicated "
+    "__global %s buffer for import src since export dst and import src are "
+    "aliased*/\n"
+    "                       __local %s *localBuffer, /* there isn't another "
+    "__local %s local buffer since export src and import dst are aliased*/\n"
+    "                       int exportSrcLocalSize, int "
+    "exportCopiesPerWorkItem,\n"
+    "                       int importSrcLocalSize, int "
+    "importCopiesPerWorkItem )\n"
+    "{\n"
+    "    int i;\n"
+    "    int localImportOffset = exportSrcLocalSize - importSrcLocalSize;\n"
+    // Each work-item first zeroes its own slice of the local buffer
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        localBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] = "
+    "(%s)(%s)0;\n"
+    "    }\n"
+    "    // no need to set another local buffer values to (%s)(%s)0 since "
+    "export src and import dst are aliased (use the same buffer)\n"
+    // Barrier: every work-item must have finished zeroing the local buffer
+    // before it is filled with the data to export
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        localBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] = "
+    "exportSrc[ get_global_id( 0 )*exportCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    // Barrier: every work-item must have finished filling the local buffer
+    // before the export and the import are issued
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t events;\n"
+    "    events = async_work_group_copy((__global "
+    "%s*)(exportDstImportSrc+exportSrcLocalSize*get_group_id(0)), (__local "
+    "const %s*)localBuffer, (size_t)exportSrcLocalSize, 0 );\n"
+    "    async_work_group_copy_fence( CLK_GLOBAL_MEM_FENCE | "
+    "CLK_LOCAL_MEM_FENCE );\n"
+    "    events = async_work_group_copy( (__local "
+    "%s*)(localBuffer+localImportOffset), (__global const "
+    "%s*)(exportDstImportSrc+exportSrcLocalSize*get_group_id(0) + "
+    "(exportSrcLocalSize - importSrcLocalSize)), (size_t)importSrcLocalSize, "
+    "events );\n"
+    // Both copies share the single event_t above, so wait on exactly one
+    // event; then verify by manually copying the imported data to the dest
+    "    wait_group_events( 1, &events );\n"
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "(localBuffer+localImportOffset)[ get_local_id( 0 "
+    ")*importCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    "}\n";
+
+static const char *export_after_import_aliased_local_kernel =
+    "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *importSrc, __global %s "
+    "*importDst,\n"
+    "                       const __global %s *exportDst, /* there isn't a "
+    "dedicated __global %s buffer for export src since the local memory is "
+    "aliased, so the export src is taken from it */\n"
+    "                       __local %s *localBuffer, /* there isn't another "
+    "__local %s local buffer since import dst and export src are aliased*/\n"
+    "                       int importSrcLocalSize, int "
+    "importCopiesPerWorkItem,\n"
+    "                       int exportSrcLocalSize, int "
+    "exportCopiesPerWorkItem )\n"
+    "{\n"
+    "    int i;\n"
+    // Zero the local storage first
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        localBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "(%s)(%s)0;\n"
+    "    }\n"
+    "    // no need to set another local buffer values to (%s)(%s)0 since "
+    "import dst and export src are aliased (use the same buffer)\n"
+    // Barrier so every work-item has finished zeroing the local buffer before
+    // the import and export are issued
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t events;\n"
+    "    events = async_work_group_copy( (__local %s*)localBuffer, (__global "
+    "const %s*)(importSrc+importSrcLocalSize*get_group_id(0)), "
+    "(size_t)importSrcLocalSize, 0 );\n"
+    "    async_work_group_copy_fence( CLK_LOCAL_MEM_FENCE );\n"
+    "    events = async_work_group_copy((__global "
+    "%s*)(exportDst+exportSrcLocalSize*get_group_id(0)), (__local const "
+    "%s*)(localBuffer + (importSrcLocalSize - exportSrcLocalSize)), "
+    "(size_t)exportSrcLocalSize, events );\n"
+    // The export reuses the event returned by the import, so a single event
+    // covers both copies; wait on it, then copy the local buffer to the dest
+    "    wait_group_events( 1, &events );\n"
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "localBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    "}\n";
+
+static const char *export_after_import_aliased_global_kernel =
+    "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *importSrcExportDst, __global %s "
+    "*importDst,\n"
+    "                       const __global %s *exportSrc,\n"
+    "                       /* there isn't a dedicated __global %s buffer for "
+    "export dst since import src and export dst are aliased */\n"
+    "                       __local %s *importLocalBuffer, __local %s "
+    "*exportLocalBuffer,\n"
+    "                       int importSrcLocalSize, int "
+    "importCopiesPerWorkItem,\n"
+    "                       int exportSrcLocalSize, int "
+    "exportCopiesPerWorkItem )\n"
+    "{\n"
+    "    int i;\n"
+    // Zero the local storage first
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importLocalBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ] "
+    "= (%s)(%s)0;\n"
+    "    }\n"
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        exportLocalBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] "
+    "= (%s)(%s)0;\n"
+    "    }\n"
+    // Barrier so every work-item has finished zeroing both local buffers
+    // before the export buffer is filled
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        exportLocalBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] "
+    "= exportSrc[ get_global_id( 0 )*exportCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    // Barrier so every work-item has finished filling the export buffer before
+    // the import and export are issued
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t events;\n"
+    "    events = async_work_group_copy( (__local %s*)importLocalBuffer, "
+    "(__global const "
+    "%s*)(importSrcExportDst+importSrcLocalSize*get_group_id(0)), "
+    "(size_t)importSrcLocalSize, 0 );\n"
+    "    async_work_group_copy_fence( CLK_GLOBAL_MEM_FENCE );\n"
+    "    events = async_work_group_copy((__global "
+    "%s*)(importSrcExportDst+importSrcLocalSize*get_group_id(0) + "
+    "(importSrcLocalSize - exportSrcLocalSize)), (__local const "
+    "%s*)exportLocalBuffer, (size_t)exportSrcLocalSize, events );\n"
+    // Both copies are tied to the single event, so wait on exactly one event,
+    // then copy the local buffer to the dest so the host can verify it
+    "    wait_group_events( 1, &events );\n"
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "importLocalBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    "}\n";
+
+static const char *export_after_import_aliased_global_and_local_kernel =
+    "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *importSrcExportDst, __global %s "
+    "*importDst,\n"
+    "                       /* there isn't a dedicated __global %s buffer for "
+    "export src since the local memory is aliased, so the export src is taken "
+    "from it */\n"
+    "                       /* there isn't a dedicated __global %s buffer for "
+    "export dst since import src and export dst are aliased */\n"
+    "                       __local %s *localBuffer, /* there isn't another "
+    "__local %s local buffer since import dst and export src are aliased*/\n"
+    "                       int importSrcLocalSize, int "
+    "importCopiesPerWorkItem,\n"
+    "                       int exportSrcLocalSize, int "
+    "exportCopiesPerWorkItem )\n"
+    "{\n"
+    "    int i;\n"
+    // Zero the local storage first
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        localBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "(%s)(%s)0;\n"
+    "    }\n"
+    "    // no need to set another local buffer values to (%s)(%s)0 since "
+    "import dst and export src are aliased (use the same buffer)\n"
+    // Barrier so every work-item has finished zeroing the local buffer before
+    // the import and export are issued
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t events;\n"
+    "    events = async_work_group_copy( (__local %s*)localBuffer, (__global "
+    "const %s*)(importSrcExportDst+importSrcLocalSize*get_group_id(0)), "
+    "(size_t)importSrcLocalSize, 0 );\n"
+    "    async_work_group_copy_fence( CLK_GLOBAL_MEM_FENCE | "
+    "CLK_LOCAL_MEM_FENCE );\n"
+    "    events = async_work_group_copy((__global "
+    "%s*)(importSrcExportDst+importSrcLocalSize*get_group_id(0) + "
+    "(importSrcLocalSize - exportSrcLocalSize)), (__local const "
+    "%s*)(localBuffer + (importSrcLocalSize - exportSrcLocalSize)), "
+    "(size_t)exportSrcLocalSize, events );\n"
+    // Both copies are tied to the single event, so wait on exactly one event,
+    // then copy the local buffer to the dest so the host can verify it
+    "    wait_group_events( 1, &events );\n"
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "localBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    "}\n";
+
+// Logs the elements that differ between the expected and actual host buffers
+// of one transaction and returns the updated running failure count. Elements
+// are elementSize bytes apart, but only the first typeSize bytes of each are
+// compared. ordinal ("1st" or "2nd") names the transaction in the message
+// printed before the very first failure. Logging stops once more than five
+// failures have been printed in total.
+static int log_copy_fence_mismatches(const void *expected, const void *actual,
+                                     size_t bufferSize, size_t elementSize,
+                                     size_t typeSize, const char *ordinal,
+                                     int failuresPrinted)
+{
+    const unsigned char *inchar = (const unsigned char *)expected;
+    const unsigned char *outchar = (const unsigned char *)actual;
+    for (int i = 0; i < (int)bufferSize; i += (int)elementSize)
+    {
+        if (memcmp(inchar + i, outchar + i, typeSize) != 0)
+        {
+            char values[4096];
+            values[0] = 0;
+            if (failuresPrinted == 0)
+            {
+                // Print first failure message
+                log_error("ERROR: Results of %s transaction did not "
+                          "validate!\n",
+                          ordinal);
+            }
+            sprintf(values + strlen(values), "%d -> [", i);
+            for (int j = 0; j < (int)elementSize; j++)
+                sprintf(values + strlen(values), "%2x ", inchar[i + j]);
+            sprintf(values + strlen(values), "] != [");
+            for (int j = 0; j < (int)elementSize; j++)
+                sprintf(values + strlen(values), "%2x ", outchar[i + j]);
+            sprintf(values + strlen(values), "]");
+            log_error("%s\n", values);
+            failuresPrinted++;
+        }
+
+        if (failuresPrinted > 5)
+        {
+            log_error("Not printing further failures...\n");
+            break;
+        }
+    }
+    return failuresPrinted;
+}
+
+// Builds kernelCode for one element type (vecType x vecSize), runs its two
+// fenced async copy transactions and verifies both results on the host.
+// export_after_import, aliased_local_mem and aliased_global_mem must match
+// the kernel source: they decide which buffer arguments are created and
+// passed. Returns 0 on success or when the extension is unsupported, and a
+// non-zero value on failure.
+int test_copy_fence(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, const char *kernelCode,
+                    ExplicitType vecType, int vecSize, bool export_after_import,
+                    bool aliased_local_mem, bool aliased_global_mem)
+{
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper streams[4];
+    size_t threads[1], localThreads[1];
+    bool transaction1DstIsTransaction2Src =
+        (aliased_global_mem && !export_after_import)
+        || (aliased_local_mem && export_after_import);
+    bool transaction1SrcIsTransaction2Dst =
+        aliased_global_mem && export_after_import;
+    char vecNameString[64];
+    if (vecSize == 1)
+        sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
+    else
+        sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType),
+                vecSize);
+
+    log_info("Testing %s\n", vecNameString);
+
+    if (!is_extension_available(deviceID, "cl_khr_async_work_group_copy_fence"))
+    {
+        log_info(
+            "Device does not support extended async copies fence. Skipping "
+            "test.\n");
+        return 0;
+    }
+
+    cl_ulong max_local_mem_size;
+    error =
+        clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
+                        sizeof(max_local_mem_size), &max_local_mem_size, NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
+
+    // One format argument per %s in the kernel template: the optional fp64
+    // pragma first, then the vector and scalar type names.
+    char programSource[4096];
+    sprintf(programSource, kernelCode,
+            vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
+                               : "",
+            vecNameString, vecNameString, vecNameString, vecNameString,
+            vecNameString, vecNameString, vecNameString,
+            get_explicit_type_name(vecType), vecNameString,
+            get_explicit_type_name(vecType), vecNameString, vecNameString,
+            vecNameString, vecNameString);
+    const char *programPtr = programSource;
+
+    error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                        &programPtr, "test_fn");
+    test_error(error, "Unable to create testing kernel");
+
+    size_t max_workgroup_size;
+    error = clGetKernelWorkGroupInfo(
+        kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size),
+        &max_workgroup_size, NULL);
+    test_error(
+        error,
+        "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
+
+    size_t max_local_workgroup_size[3];
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                            sizeof(max_local_workgroup_size),
+                            max_local_workgroup_size, NULL);
+    test_error(error,
+               "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
+
+    // Pick the minimum of the device and the kernel
+    if (max_workgroup_size > max_local_workgroup_size[0])
+        max_workgroup_size = max_local_workgroup_size[0];
+
+    size_t transaction1NumberOfCopiesPerWorkitem = 13;
+    size_t transaction2NumberOfCopiesPerWorkitem = 2;
+    // 3-component vectors occupy the storage of 4 components
+    size_t elementSize =
+        get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
+    size_t localStorageSpacePerWorkitem =
+        transaction1NumberOfCopiesPerWorkitem * elementSize
+        + (aliased_local_mem
+               ? 0
+               : transaction2NumberOfCopiesPerWorkitem * elementSize);
+    // Use at most half of the local memory; halve before narrowing so that a
+    // large local memory size is not truncated by the conversion.
+    size_t maxLocalWorkgroupSize =
+        (size_t)(max_local_mem_size / 2) / localStorageSpacePerWorkitem;
+
+    // Calculation can return 0 on embedded devices due to 1KB local mem limit
+    if (maxLocalWorkgroupSize == 0)
+    {
+        maxLocalWorkgroupSize = 1;
+    }
+
+    size_t localWorkgroupSize = maxLocalWorkgroupSize;
+    if (maxLocalWorkgroupSize > max_workgroup_size)
+        localWorkgroupSize = max_workgroup_size;
+
+    size_t transaction1LocalBufferSize = localWorkgroupSize * elementSize
+        * transaction1NumberOfCopiesPerWorkitem;
+    size_t transaction2LocalBufferSize = localWorkgroupSize * elementSize
+        * transaction2NumberOfCopiesPerWorkitem; // irrelevant if
+                                                 // aliased_local_mem
+    size_t numberOfLocalWorkgroups = 1111;
+    size_t transaction1GlobalBufferSize =
+        numberOfLocalWorkgroups * transaction1LocalBufferSize;
+    size_t transaction2GlobalBufferSize =
+        numberOfLocalWorkgroups * transaction2LocalBufferSize;
+    size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize;
+
+    void *transaction1InBuffer = malloc(transaction1GlobalBufferSize);
+    void *transaction1OutBuffer = malloc(transaction1GlobalBufferSize);
+    void *transaction2InBuffer = malloc(transaction2GlobalBufferSize);
+    void *transaction2OutBuffer = malloc(transaction2GlobalBufferSize);
+
+    // Owns every host staging buffer so that each exit path releases them;
+    // test_error is expected to return from this function on failure.
+    struct HostBufferGuard
+    {
+        void *buffers[5];
+        ~HostBufferGuard()
+        {
+            for (size_t i = 0; i < 5; i++) free(buffers[i]);
+        }
+    } hostBuffers = { { transaction1InBuffer, transaction1OutBuffer,
+                        transaction2InBuffer, transaction2OutBuffer, NULL } };
+
+    if (!transaction1InBuffer || !transaction1OutBuffer
+        || !transaction2InBuffer || !transaction2OutBuffer)
+    {
+        log_error("ERROR: Unable to allocate host buffers\n");
+        return -1;
+    }
+    memset(transaction1OutBuffer, 0, transaction1GlobalBufferSize);
+    memset(transaction2OutBuffer, 0, transaction2GlobalBufferSize);
+
+    cl_int transaction1CopiesPerWorkitemInt, transaction1CopiesPerWorkgroup,
+        transaction2CopiesPerWorkitemInt, transaction2CopiesPerWorkgroup;
+    transaction1CopiesPerWorkitemInt =
+        (int)transaction1NumberOfCopiesPerWorkitem;
+    transaction1CopiesPerWorkgroup =
+        (int)(transaction1NumberOfCopiesPerWorkitem * localWorkgroupSize);
+    transaction2CopiesPerWorkitemInt =
+        (int)transaction2NumberOfCopiesPerWorkitem;
+    transaction2CopiesPerWorkgroup =
+        (int)(transaction2NumberOfCopiesPerWorkitem * localWorkgroupSize);
+
+    log_info(
+        "Global: %d, local %d. 1st Transaction: local buffer %db, global "
+        "buffer %db, each work group will copy %d elements and each work "
+        "item will copy %d elements. 2nd Transaction: local buffer "
+        "%db, global buffer %db, each work group will copy %d elements and "
+        "each work item will copy %d elements\n",
+        (int)globalWorkgroupSize, (int)localWorkgroupSize,
+        (int)transaction1LocalBufferSize, (int)transaction1GlobalBufferSize,
+        transaction1CopiesPerWorkgroup, transaction1CopiesPerWorkitemInt,
+        (int)transaction2LocalBufferSize, (int)transaction2GlobalBufferSize,
+        transaction2CopiesPerWorkgroup, transaction2CopiesPerWorkitemInt);
+
+    threads[0] = globalWorkgroupSize;
+    localThreads[0] = localWorkgroupSize;
+
+    MTdata d = init_genrand(gRandomSeed);
+    generate_random_data(
+        vecType, transaction1GlobalBufferSize / get_explicit_type_size(vecType),
+        d, transaction1InBuffer);
+    if (!transaction1DstIsTransaction2Src)
+    {
+        generate_random_data(vecType,
+                             transaction2GlobalBufferSize
+                                 / get_explicit_type_size(vecType),
+                             d, transaction2InBuffer);
+    }
+    free_mtdata(d);
+
+    streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                transaction1GlobalBufferSize,
+                                transaction1InBuffer, &error);
+    test_error(error, "Unable to create input buffer");
+    streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                transaction1GlobalBufferSize,
+                                transaction1OutBuffer, &error);
+    test_error(error, "Unable to create output buffer");
+    if (!transaction1DstIsTransaction2Src)
+    {
+        streams[2] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                    transaction2GlobalBufferSize,
+                                    transaction2InBuffer, &error);
+        test_error(error, "Unable to create input buffer");
+    }
+    if (!transaction1SrcIsTransaction2Dst)
+    {
+        streams[3] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                    transaction2GlobalBufferSize,
+                                    transaction2OutBuffer, &error);
+        test_error(error, "Unable to create output buffer");
+    }
+
+    cl_uint argIndex = 0;
+    error = clSetKernelArg(kernel, argIndex, sizeof(streams[0]), &streams[0]);
+    test_error(error, "Unable to set kernel argument");
+    ++argIndex;
+    error = clSetKernelArg(kernel, argIndex, sizeof(streams[1]), &streams[1]);
+    test_error(error, "Unable to set kernel argument");
+    ++argIndex;
+    if (!transaction1DstIsTransaction2Src)
+    {
+        error =
+            clSetKernelArg(kernel, argIndex, sizeof(streams[2]), &streams[2]);
+        test_error(error, "Unable to set kernel argument");
+        ++argIndex;
+    }
+    if (!transaction1SrcIsTransaction2Dst)
+    {
+        error =
+            clSetKernelArg(kernel, argIndex, sizeof(streams[3]), &streams[3]);
+        test_error(error, "Unable to set kernel argument");
+        ++argIndex;
+    }
+    error = clSetKernelArg(kernel, argIndex, transaction1LocalBufferSize, NULL);
+    test_error(error, "Unable to set kernel argument");
+    ++argIndex;
+    if (!aliased_local_mem)
+    {
+        error =
+            clSetKernelArg(kernel, argIndex, transaction2LocalBufferSize, NULL);
+        test_error(error, "Unable to set kernel argument");
+        ++argIndex;
+    }
+    error =
+        clSetKernelArg(kernel, argIndex, sizeof(transaction1CopiesPerWorkgroup),
+                       &transaction1CopiesPerWorkgroup);
+    test_error(error, "Unable to set kernel argument");
+    ++argIndex;
+    error = clSetKernelArg(kernel, argIndex,
+                           sizeof(transaction1CopiesPerWorkitemInt),
+                           &transaction1CopiesPerWorkitemInt);
+    test_error(error, "Unable to set kernel argument");
+    ++argIndex;
+    error =
+        clSetKernelArg(kernel, argIndex, sizeof(transaction2CopiesPerWorkgroup),
+                       &transaction2CopiesPerWorkgroup);
+    test_error(error, "Unable to set kernel argument");
+    ++argIndex;
+    error = clSetKernelArg(kernel, argIndex,
+                           sizeof(transaction2CopiesPerWorkitemInt),
+                           &transaction2CopiesPerWorkitemInt);
+    test_error(error, "Unable to set kernel argument");
+
+    // Enqueue
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, NULL);
+    test_error(error, "Unable to queue kernel");
+
+    // Read
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0,
+                                transaction1GlobalBufferSize,
+                                transaction1OutBuffer, 0, NULL, NULL);
+    test_error(error, "Unable to read results");
+    if (transaction1DstIsTransaction2Src)
+    {
+        for (size_t idx = 0; idx < numberOfLocalWorkgroups; idx++)
+        {
+            memcpy(
+                (void *)((unsigned char *)transaction2InBuffer
+                         + idx * transaction2CopiesPerWorkgroup * elementSize),
+                (const void *)((unsigned char *)transaction1OutBuffer
+                               + (idx * transaction1CopiesPerWorkgroup
+                                  + (transaction1CopiesPerWorkgroup
+                                     - transaction2CopiesPerWorkgroup))
+                                   * elementSize),
+                (size_t)transaction2CopiesPerWorkgroup * elementSize);
+        }
+    }
+    if (transaction1SrcIsTransaction2Dst)
+    {
+        void *transaction1SrcBuffer = malloc(transaction1GlobalBufferSize);
+        // Hand the buffer to the guard before anything can return early
+        hostBuffers.buffers[4] = transaction1SrcBuffer;
+        if (transaction1SrcBuffer == NULL)
+        {
+            log_error("ERROR: Unable to allocate host buffers\n");
+            return -1;
+        }
+        error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0,
+                                    transaction1GlobalBufferSize,
+                                    transaction1SrcBuffer, 0, NULL, NULL);
+        test_error(error, "Unable to read results");
+        for (size_t idx = 0; idx < numberOfLocalWorkgroups; idx++)
+        {
+            memcpy(
+                (void *)((unsigned char *)transaction2OutBuffer
+                         + idx * transaction2CopiesPerWorkgroup * elementSize),
+                (const void *)((unsigned char *)transaction1SrcBuffer
+                               + (idx * transaction1CopiesPerWorkgroup
+                                  + (transaction1CopiesPerWorkgroup
+                                     - transaction2CopiesPerWorkgroup))
+                                   * elementSize),
+                (size_t)transaction2CopiesPerWorkgroup * elementSize);
+        }
+    }
+    else
+    {
+        error = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0,
+                                    transaction2GlobalBufferSize,
+                                    transaction2OutBuffer, 0, NULL, NULL);
+        test_error(error, "Unable to read results");
+    }
+
+    // Verify
+    int failuresPrinted = 0;
+    size_t typeSize = get_explicit_type_size(vecType) * vecSize;
+    if (memcmp(transaction1InBuffer, transaction1OutBuffer,
+               transaction1GlobalBufferSize)
+        != 0)
+    {
+        failuresPrinted = log_copy_fence_mismatches(
+            transaction1InBuffer, transaction1OutBuffer,
+            transaction1GlobalBufferSize, elementSize, typeSize, "1st",
+            failuresPrinted);
+    }
+    if (memcmp(transaction2InBuffer, transaction2OutBuffer,
+               transaction2GlobalBufferSize)
+        != 0)
+    {
+        failuresPrinted = log_copy_fence_mismatches(
+            transaction2InBuffer, transaction2OutBuffer,
+            transaction2GlobalBufferSize, elementSize, typeSize, "2nd",
+            failuresPrinted);
+    }
+
+    return failuresPrinted ? -1 : 0;
+}
+
+// Runs test_copy_fence for every element type and vector width the device
+// supports; returns -1 if any combination fails, 0 otherwise.
+int test_copy_fence_all_types(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, const char *kernelCode,
+                              bool export_after_import, bool aliased_local_mem,
+                              bool aliased_global_mem)
+{
+    // Both tables end with a sentinel entry instead of carrying a length
+    const ExplicitType elementTypes[] = { kChar,   kUChar,
+                                          kShort,  kUShort,
+                                          kInt,    kUInt,
+                                          kLong,   kULong,
+                                          kFloat,  kDouble,
+                                          kNumExplicitTypes };
+    const unsigned int widths[] = { 1, 2, 3, 4, 8, 16, 0 };
+    bool anyFailed = false;
+
+    for (const ExplicitType *type = elementTypes; *type != kNumExplicitTypes;
+         ++type)
+    {
+        const bool is64BitInt = (*type == kLong || *type == kULong);
+        // Skip element types the device cannot handle
+        if (*type == kDouble
+            && !is_extension_available(deviceID, "cl_khr_fp64"))
+            continue;
+        if (is64BitInt && !gHasLong) continue;
+
+        for (const unsigned int *width = widths; *width != 0; ++width)
+        {
+            int result = test_copy_fence(
+                deviceID, context, queue, kernelCode, *type, (int)*width,
+                export_after_import, aliased_local_mem, aliased_global_mem);
+            if (result != 0) anyFailed = true;
+        }
+    }
+
+    return anyFailed ? -1 : 0;
+}
+
+int test_async_work_group_copy_fence_import_after_export_aliased_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    const char *source = import_after_export_aliased_local_kernel;
+    return test_copy_fence_all_types(deviceID, context, queue, source,
+                                     false, true, false); // local alias only
+}
+
+int test_async_work_group_copy_fence_import_after_export_aliased_global(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    const char *source = import_after_export_aliased_global_kernel;
+    return test_copy_fence_all_types(deviceID, context, queue, source,
+                                     false, false, true); // global alias only
+}
+
+int test_async_work_group_copy_fence_import_after_export_aliased_global_and_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    const char *source = import_after_export_aliased_global_and_local_kernel;
+    return test_copy_fence_all_types(deviceID, context, queue, source,
+                                     false, true, true); // both aliased
+}
+
+int test_async_work_group_copy_fence_export_after_import_aliased_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    const char *source = export_after_import_aliased_local_kernel;
+    return test_copy_fence_all_types(deviceID, context, queue, source,
+                                     true, true, false); // local alias only
+}
+
+int test_async_work_group_copy_fence_export_after_import_aliased_global(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    const char *source = export_after_import_aliased_global_kernel;
+    return test_copy_fence_all_types(deviceID, context, queue, source,
+                                     true, false, true); // global alias only
+}
+
+int test_async_work_group_copy_fence_export_after_import_aliased_global_and_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    const char *source = export_after_import_aliased_global_and_local_kernel;
+    return test_copy_fence_all_types(deviceID, context, queue, source,
+                                     true, true, true); // both aliased
+}

From dbd3e787fe4cf977c2da7c58f7505918eb293e6c Mon Sep 17 00:00:00 2001
From: Sven van Haastregt <sven.vanhaastregt@arm.com>
Date: Thu, 18 Mar 2021 21:46:05 +0000
Subject: [PATCH 064/158] Do not dereference null pointer for no matching tests
 (#1191)

When invoking for example

    test_c11_atomics test-that-does-not-exist

parseAndCallCommandLineTests() would attempt to dereference
`resultTestList` which is still a null pointer.

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
---
 test_common/harness/testHarness.cpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/test_common/harness/testHarness.cpp b/test_common/harness/testHarness.cpp
index 6b4c720197..5d96c43f56 100644
--- a/test_common/harness/testHarness.cpp
+++ b/test_common/harness/testHarness.cpp
@@ -713,20 +713,20 @@ int parseAndCallCommandLineTests(int argc, const char *argv[],
             ret = saveResultsToJson(filename, argv[0], testList,
                                     selectedTestList, resultTestList, testNum);
         }
-    }
 
-    if (std::any_of(resultTestList, resultTestList + testNum,
-                    [](test_status result) {
-                        switch (result)
-                        {
-                            case TEST_PASS:
-                            case TEST_SKIP: return false;
-                            case TEST_FAIL:
-                            default: return true;
-                        };
-                    }))
-    {
-        ret = EXIT_FAILURE;
+        if (std::any_of(resultTestList, resultTestList + testNum,
+                        [](test_status result) {
+                            switch (result)
+                            {
+                                case TEST_PASS:
+                                case TEST_SKIP: return false;
+                                case TEST_FAIL:
+                                default: return true;
+                            };
+                        }))
+        {
+            ret = EXIT_FAILURE;
+        }
     }
 
     free(selectedTestList);

From ef19796590d85dcea3959d971e934acaa7207d77 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Wed, 24 Mar 2021 16:19:37 +0000
Subject: [PATCH 065/158] Improve log consistency (#1196)

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/main.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 9d0b615ae3..80d4b5b838 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -1264,7 +1264,7 @@ test_status InitCL(cl_device_id device)
         clCreateBuffer(gContext, device_flags, BUFFER_SIZE, gIn2, &error);
     if (gInBuffer2 == NULL || error)
     {
-        vlog_error("clCreateArray2 failed for input (%d)\n", error);
+        vlog_error("clCreateBuffer2 failed for input (%d)\n", error);
         return TEST_FAIL;
     }
 
@@ -1272,7 +1272,7 @@ test_status InitCL(cl_device_id device)
         clCreateBuffer(gContext, device_flags, BUFFER_SIZE, gIn3, &error);
     if (gInBuffer3 == NULL || error)
     {
-        vlog_error("clCreateArray3 failed for input (%d)\n", error);
+        vlog_error("clCreateBuffer3 failed for input (%d)\n", error);
         return TEST_FAIL;
     }
 
@@ -1290,14 +1290,14 @@ test_status InitCL(cl_device_id device)
                                        gOut[i], &error);
         if (gOutBuffer[i] == NULL || error)
         {
-            vlog_error("clCreateArray failed for output (%d)\n", error);
+            vlog_error("clCreateBuffer failed for output (%d)\n", error);
             return TEST_FAIL;
         }
         gOutBuffer2[i] = clCreateBuffer(gContext, device_flags, BUFFER_SIZE,
                                         gOut2[i], &error);
         if (gOutBuffer2[i] == NULL || error)
         {
-            vlog_error("clCreateArray2 failed for output (%d)\n", error);
+            vlog_error("clCreateBuffer2 failed for output (%d)\n", error);
             return TEST_FAIL;
         }
     }

From 8488d4b2dee231690691e997674642f667d12a86 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Wed, 24 Mar 2021 16:20:05 +0000
Subject: [PATCH 066/158] Use the same kernel name for all tests (#1194)

Make signature of BuildKernel more consistent across tests: now only two
variants exist:

  int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                  cl_program *p, bool relaxedMode)

or

  int BuildKernel(const char *name/symbol, int vectorSize,
                  cl_uint kernel_count, cl_kernel *k, cl_program *p,
                  bool relaxedMode)

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../binary_operator_double.cpp                | 32 +++++++------------
 .../binary_operator_float.cpp                 | 30 +++++++----------
 2 files changed, 23 insertions(+), 39 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index cdb4663629..a02e53ba36 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -20,14 +20,12 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, const char *operator_symbol,
-                       int vectorSize, cl_uint kernel_count, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+static int BuildKernel(const char *operator_symbol, int vectorSize,
+                       cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                       bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void ",
-                        name,
-                        "_kernel",
+                        "__kernel void math_kernel",
                         sizeNames[vectorSize],
                         "( __global double",
                         sizeNames[vectorSize],
@@ -38,16 +36,14 @@ static int BuildKernel(const char *name, const char *operator_symbol,
                         "* in2 )\n"
                         "{\n"
                         "   size_t i = get_global_id(0);\n"
-                        "   out[i] =  in1[i] ",
+                        "   out[i] = in1[i] ",
                         operator_symbol,
                         " in2[i];\n"
                         "}\n" };
 
     const char *c3[] = {
         "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void ",
-        name,
-        "_kernel",
+        "__kernel void math_kernel",
         sizeNames[vectorSize],
         "( __global double* out, __global double* in, __global double* in2)\n"
         "{\n"
@@ -105,7 +101,7 @@ static int BuildKernel(const char *name, const char *operator_symbol,
     }
 
     char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "%s_kernel%s", name,
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
              sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
@@ -118,7 +114,6 @@ typedef struct BuildKernelInfo
     cl_uint kernel_count;
     cl_kernel **kernels;
     cl_program *programs;
-    const char *name;
     const char *operator_symbol;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
@@ -127,7 +122,7 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernel(info->name, info->operator_symbol, i, info->kernel_count,
+    return BuildKernel(info->operator_symbol, i, info->kernel_count,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
@@ -403,13 +398,10 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
 
     // Init the kernels
     {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex,
-                                       test_info.threadCount,
-                                       test_info.k,
-                                       test_info.programs,
-                                       f->name,
-                                       f->nameInCode,
-                                       relaxedMode };
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
index eba3ee6650..39070cb2f2 100644
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -20,13 +20,11 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, const char *operator_symbol,
-                       int vectorSize, cl_uint kernel_count, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+static int BuildKernel(const char *operator_symbol, int vectorSize,
+                       cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                       bool relaxedMode)
 {
-    const char *c[] = { "__kernel void ",
-                        name,
-                        "_kernel",
+    const char *c[] = { "__kernel void math_kernel",
                         sizeNames[vectorSize],
                         "( __global float",
                         sizeNames[vectorSize],
@@ -43,9 +41,7 @@ static int BuildKernel(const char *name, const char *operator_symbol,
                         "}\n" };
 
     const char *c3[] = {
-        "__kernel void ",
-        name,
-        "_kernel",
+        "__kernel void math_kernel",
         sizeNames[vectorSize],
         "( __global float* out, __global float* in, __global float* in2)\n"
         "{\n"
@@ -103,7 +99,7 @@ static int BuildKernel(const char *name, const char *operator_symbol,
     }
 
     char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "%s_kernel%s", name,
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
              sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
@@ -116,7 +112,6 @@ typedef struct BuildKernelInfo
     cl_uint kernel_count;
     cl_kernel **kernels;
     cl_program *programs;
-    const char *name;
     const char *operator_symbol;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
@@ -125,7 +120,7 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
-    return BuildKernel(info->name, info->operator_symbol, i, info->kernel_count,
+    return BuildKernel(info->operator_symbol, i, info->kernel_count,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
@@ -395,13 +390,10 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
 
     // Init the kernels
     {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex,
-                                       test_info.threadCount,
-                                       test_info.k,
-                                       test_info.programs,
-                                       f->name,
-                                       f->nameInCode,
-                                       relaxedMode };
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
         if ((error = ThreadPool_Do(BuildKernelFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))

From 5281b4c916bd7ab540dd9c1c7a199a7dba52165f Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Wed, 24 Mar 2021 16:29:25 +0000
Subject: [PATCH 067/158] Improve consistency of clEnqueueWriteBuffer
 operations (#1195)

Reduce differences between tests by ensuring all these operations are
non-blocking.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/binary_two_results_i_double.cpp | 12 ++++++------
 .../math_brute_force/binary_two_results_i_float.cpp  |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_two_results_i_double.cpp b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
index f1a27ee36e..d71585e65f 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
@@ -210,14 +210,14 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             p2[j] = DoubleFromUInt32(genrand_int32(d));
         }
 
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           bufferSize, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
@@ -230,7 +230,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
             if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
                                           bufferSize, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
@@ -240,7 +240,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
             memset_pattern4(gOut2[j], &pattern, bufferSize);
             if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
                                           bufferSize, gOut2[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
@@ -573,13 +573,13 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         double *p = (double *)gIn;
         for (j = 0; j < bufferSize / sizeof(cl_double); j++)
             p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           bufferSize, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
diff --git a/test_conformance/math_brute_force/binary_two_results_i_float.cpp b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
index 9db5c1c1c2..b6f1f1bdf0 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
@@ -218,7 +218,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             return error;
         }
 
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           bufferSize, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);

From baa226a779ad3f1018c270607a3306c0095da134 Mon Sep 17 00:00:00 2001
From: James Price <jamesprice.dev@gmail.com>
Date: Tue, 6 Apr 2021 11:42:46 -0400
Subject: [PATCH 068/158] Remove compiler options that are specific to Apple
 (#1114)

These are causing test failures for non-Apple implementations of
OpenCL running on macOS.
---
 test_conformance/compiler/test_build_options.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/test_conformance/compiler/test_build_options.cpp b/test_conformance/compiler/test_build_options.cpp
index 5bd9411523..c25fd10fe7 100644
--- a/test_conformance/compiler/test_build_options.cpp
+++ b/test_conformance/compiler/test_build_options.cpp
@@ -61,10 +61,6 @@ const char *optimization_options[] = {
     "-cl-fast-relaxed-math",
     "-w",
     "-Werror",
-#if defined( __APPLE__ )
-    "-cl-opt-enable",
-    "-cl-auto-vectorize-enable"
-#endif
     };
 
 cl_int get_result_from_program( cl_context context, cl_command_queue queue, cl_program program, cl_int *outValue )

From c5e4ca6c913d582855bd6aa60642e225f31df25a Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Tue, 6 Apr 2021 16:43:55 +0100
Subject: [PATCH 069/158] Remove unnecessary APPLE specific code (#1188)

To reduce differences between tests, remove APPLE specific code from
unary tests as no other tests have similar logic.

Ensure gMeasureTimes is consistently initialised regardless of operating
systems to ensure a consistent command line interface.

The remaining APPLE specific pieces of code relate either to include
paths, or to the implementation of PreventSleep(), ResumeSleep() and
GetTime(). Those are not removed in this commit.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/main.cpp    | 20 -------------------
 .../math_brute_force/unary_double.cpp         | 20 -------------------
 .../math_brute_force/unary_float.cpp          |  4 ----
 3 files changed, 44 deletions(-)

diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 80d4b5b838..0e1b40a443 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -62,11 +62,7 @@ static int32_t gEndTestNumber = -1;
 int gSkipCorrectnessTesting = 0;
 int gStopOnError = 0;
 static bool gSkipRestOfTests;
-#if defined(__APPLE__)
-int gMeasureTimes = 1;
-#else
 int gMeasureTimes = 0;
-#endif
 int gReportAverageTimes = 0;
 int gForceFTZ = 0;
 int gWimpyMode = 0;
@@ -805,11 +801,6 @@ int main(int argc, const char *argv[])
         return -1;
     }
 
-#if defined(__APPLE__)
-    struct timeval startTime;
-    gettimeofday(&startTime, NULL);
-#endif
-
     error = ParseArgs(argc, argv);
     if (error) return error;
 
@@ -866,14 +857,6 @@ int main(int argc, const char *argv[])
 
     ReleaseCL();
 
-#if defined(__APPLE__)
-    struct timeval endTime;
-    gettimeofday(&endTime, NULL);
-    double time = (double)endTime.tv_sec - (double)startTime.tv_sec;
-    time += 1e-6 * ((double)endTime.tv_usec - (double)startTime.tv_usec);
-    vlog("time: %f s\n", time);
-#endif
-
     return ret;
 }
 
@@ -1211,9 +1194,6 @@ test_status InitCL(cl_device_id device)
         return TEST_FAIL;
     }
 
-#if defined(__APPLE__)
-    // FIXME: use clProtectedArray
-#endif
     // Allocate buffers
     cl_uint min_alignment = 0;
     error = clGetDeviceInfo(gDevice, CL_DEVICE_MEM_BASE_ADDR_ALIGN,
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
index 1074e9cafd..615d0fb950 100644
--- a/test_conformance/math_brute_force/unary_double.cpp
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -20,10 +20,6 @@
 
 #include <cstring>
 
-#if defined(__APPLE__)
-#include <sys/time.h>
-#endif
-
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
@@ -166,12 +162,6 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     size_t i, j;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
-#if defined(__APPLE__)
-    struct timeval time_val;
-    gettimeofday(&time_val, NULL);
-    double start_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
-    double end_time;
-#endif
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
     // Init test_info
@@ -302,12 +292,6 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog("passed");
     }
 
-
-#if defined(__APPLE__)
-    gettimeofday(&time_val, NULL);
-    end_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
-#endif
-
     if (gMeasureTimes)
     {
         // Init input array
@@ -385,10 +369,6 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     }
 
     if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
-
-#if defined(__APPLE__)
-    vlog("\t(%2.2f seconds)", end_time - start_time);
-#endif
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/unary_float.cpp b/test_conformance/math_brute_force/unary_float.cpp
index 126db73eaa..3666589230 100644
--- a/test_conformance/math_brute_force/unary_float.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -20,10 +20,6 @@
 
 #include <cstring>
 
-#if defined(__APPLE__)
-#include <sys/time.h>
-#endif
-
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {

From 71bef8563ec1580411f9cf3ca76b05e6fdbe1263 Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Tue, 6 Apr 2021 18:25:48 +0200
Subject: [PATCH 070/158] New subgroups - full changes set (#1074)

* Extended subgroups - extended types

* Extended subgroups - non uniform vote tests

* Extended subgroups - non uniform arithmetic tests

* Extended subgroups - ballot tests

* Extended subgroups - clustered reduce tests

* Extended subgroups - shuffle tests

* Extended subgroups - formatting issues

* Extended subgroups - review fixes

* Extended subgroups - review fixes

Fixed: removed additional brakes, kernel_sstr

* Extended subgroups - fix macos build error

* Extended subgroups - review fixes

Fixed: mac os build error

* Extended subgroups - data type verification example

* Extended subgroups - error unification

* Extended subgroups - fix header years

* Extended subgroups - use is_half_nan

* Extended subgroups - compare half as float

* Review fixes mostly for ballot functions.

- Modify kernels for better handling active/inactive workitems
- Modify gen/chk functions for handling non uniform workgroup sizes
- Introduce new variables naming convention
- minor fixes

* Extended subgroups - simplification data generation for ballot lsb/msb functions

* Extended subgroups - minor fixes

* Extended subgroups - move common code to function

* Extended subgroups - formatting errors fix

* Extended subgroups - fix build error

* Extended subgroups - sub_group_elect more sophisticated

Define a mask which is a 4-byte pattern where a bit set to 1 means the work item is active.
If a work item in the subgroup matches the pattern, then run sub_group_elect().

* Extended subgroups - fix Ubuntu build error

* Extended subgroups - voting function review fixes

* adjust all function for using masks
* remove calculate templates
* merge code to one common template
* check results only in active workitems
* normalize values on host side
* minor fixes

* Extended subgroups - fix typos

* Set of fixes and improvements after review

* define WorkGroupParams to stop extended parameters list in function
* better workitems mask handling (WorkGroupParams)
* narrow values of data input generation to avoid overflows (arithmetic func)
* implement work item masks for arithmetic functions
* enable half type testing for reduction/scan/broadcast
* minor fixes

* Extended subgroups - fix Linux issues

* Extended subgroups - fix sub_group_local_id data type

* Extended subgroups - use vector instead of array.

* Extended subgroups - change names to subgroup

* Extended subgroups - uncomment code, fix build

* Extended subgroups - build fix, use cl_half_from_float func

* Extended subgroups - remove is_half_nan

* Extended subgroups - do not use undef min/max

* Extended subgroups - use parenthesis, fix formatting
---
 test_conformance/subgroups/CMakeLists.txt     |   10 +-
 test_conformance/subgroups/main.cpp           |   13 +-
 test_conformance/subgroups/procs.h            |   42 +-
 .../subgroups/subgroup_common_kernels.cpp     |  106 ++
 .../subgroups/subgroup_common_kernels.h       |   32 +
 .../subgroups/subgroup_common_templates.h     |  911 +++++++++++
 test_conformance/subgroups/subhelpers.h       | 1326 +++++++++++++++--
 test_conformance/subgroups/test_barrier.cpp   |   46 +-
 test_conformance/subgroups/test_ifp.cpp       |   32 +-
 test_conformance/subgroups/test_subgroup.cpp  |  217 +++
 .../subgroups/test_subgroup_ballot.cpp        | 1089 ++++++++++++++
 .../test_subgroup_clustered_reduce.cpp        |  340 +++++
 .../test_subgroup_extended_types.cpp          |  138 ++
 .../test_subgroup_non_uniform_arithmetic.cpp  |  473 ++++++
 .../test_subgroup_non_uniform_vote.cpp        |  303 ++++
 .../subgroups/test_subgroup_shuffle.cpp       |   78 +
 .../test_subgroup_shuffle_relative.cpp        |   81 +
 test_conformance/subgroups/test_workgroup.cpp |  727 ---------
 18 files changed, 5041 insertions(+), 923 deletions(-)
 create mode 100644 test_conformance/subgroups/subgroup_common_kernels.cpp
 create mode 100644 test_conformance/subgroups/subgroup_common_kernels.h
 create mode 100644 test_conformance/subgroups/subgroup_common_templates.h
 create mode 100644 test_conformance/subgroups/test_subgroup.cpp
 create mode 100644 test_conformance/subgroups/test_subgroup_ballot.cpp
 create mode 100644 test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
 create mode 100644 test_conformance/subgroups/test_subgroup_extended_types.cpp
 create mode 100644 test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
 create mode 100644 test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
 create mode 100644 test_conformance/subgroups/test_subgroup_shuffle.cpp
 create mode 100644 test_conformance/subgroups/test_subgroup_shuffle_relative.cpp
 delete mode 100644 test_conformance/subgroups/test_workgroup.cpp

diff --git a/test_conformance/subgroups/CMakeLists.txt b/test_conformance/subgroups/CMakeLists.txt
index eb6a6079c2..d48af9ccfc 100644
--- a/test_conformance/subgroups/CMakeLists.txt
+++ b/test_conformance/subgroups/CMakeLists.txt
@@ -5,8 +5,16 @@ set(${MODULE_NAME}_SOURCES
     test_barrier.cpp
     test_queries.cpp
     test_workitem.cpp
-    test_workgroup.cpp
+    test_subgroup.cpp
     test_ifp.cpp
+    test_subgroup_extended_types.cpp
+    subgroup_common_kernels.cpp
+    test_subgroup_non_uniform_vote.cpp
+    test_subgroup_non_uniform_arithmetic.cpp
+    test_subgroup_ballot.cpp
+    test_subgroup_clustered_reduce.cpp
+    test_subgroup_shuffle.cpp
+    test_subgroup_shuffle_relative.cpp
 )
 
 include(../CMakeCommon.txt)
diff --git a/test_conformance/subgroups/main.cpp b/test_conformance/subgroups/main.cpp
index f9a9a9d515..44416dd729 100644
--- a/test_conformance/subgroups/main.cpp
+++ b/test_conformance/subgroups/main.cpp
@@ -27,12 +27,19 @@ test_definition test_list[] = {
     ADD_TEST_VERSION(sub_group_info_core, Version(2, 1)),
     ADD_TEST_VERSION(work_item_functions_ext, Version(2, 0)),
     ADD_TEST_VERSION(work_item_functions_core, Version(2, 1)),
-    ADD_TEST_VERSION(work_group_functions_ext, Version(2, 0)),
-    ADD_TEST_VERSION(work_group_functions_core, Version(2, 1)),
+    ADD_TEST_VERSION(subgroup_functions_ext, Version(2, 0)),
+    ADD_TEST_VERSION(subgroup_functions_core, Version(2, 1)),
     ADD_TEST_VERSION(barrier_functions_ext, Version(2, 0)),
     ADD_TEST_VERSION(barrier_functions_core, Version(2, 1)),
     ADD_TEST_VERSION(ifp_ext, Version(2, 0)),
-    ADD_TEST_VERSION(ifp_core, Version(2, 1))
+    ADD_TEST_VERSION(ifp_core, Version(2, 1)),
+    ADD_TEST(subgroup_functions_extended_types),
+    ADD_TEST(subgroup_functions_non_uniform_vote),
+    ADD_TEST(subgroup_functions_non_uniform_arithmetic),
+    ADD_TEST(subgroup_functions_ballot),
+    ADD_TEST(subgroup_functions_clustered_reduce),
+    ADD_TEST(subgroup_functions_shuffle),
+    ADD_TEST(subgroup_functions_shuffle_relative)
 };
 
 const int test_num = ARRAY_SIZE(test_list);
diff --git a/test_conformance/subgroups/procs.h b/test_conformance/subgroups/procs.h
index 3ebb13b53e..d09e8242f6 100644
--- a/test_conformance/subgroups/procs.h
+++ b/test_conformance/subgroups/procs.h
@@ -37,14 +37,12 @@ extern int test_work_item_functions_core(cl_device_id device,
                                          cl_context context,
                                          cl_command_queue queue,
                                          int num_elements);
-extern int test_work_group_functions_ext(cl_device_id device,
-                                         cl_context context,
-                                         cl_command_queue queue,
-                                         int num_elements);
-extern int test_work_group_functions_core(cl_device_id device,
-                                          cl_context context,
-                                          cl_command_queue queue,
-                                          int num_elements);
+extern int test_subgroup_functions_ext(cl_device_id device, cl_context context,
+                                       cl_command_queue queue,
+                                       int num_elements);
+extern int test_subgroup_functions_core(cl_device_id device, cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements);
 extern int test_barrier_functions_ext(cl_device_id device, cl_context context,
                                       cl_command_queue queue, int num_elements);
 extern int test_barrier_functions_core(cl_device_id device, cl_context context,
@@ -56,5 +54,31 @@ extern int test_ifp_ext(cl_device_id device, cl_context context,
                         cl_command_queue queue, int num_elements);
 extern int test_ifp_core(cl_device_id device, cl_context context,
                          cl_command_queue queue, int num_elements);
-
+extern int test_subgroup_functions_extended_types(cl_device_id device,
+                                                  cl_context context,
+                                                  cl_command_queue queue,
+                                                  int num_elements);
+extern int test_subgroup_functions_non_uniform_vote(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements);
+extern int test_subgroup_functions_non_uniform_arithmetic(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_subgroup_functions_ballot(cl_device_id device,
+                                          cl_context context,
+                                          cl_command_queue queue,
+                                          int num_elements);
+extern int test_subgroup_functions_clustered_reduce(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements);
+extern int test_subgroup_functions_shuffle(cl_device_id device,
+                                           cl_context context,
+                                           cl_command_queue queue,
+                                           int num_elements);
+extern int test_subgroup_functions_shuffle_relative(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements);
 #endif /*_procs_h*/
diff --git a/test_conformance/subgroups/subgroup_common_kernels.cpp b/test_conformance/subgroups/subgroup_common_kernels.cpp
new file mode 100644
index 0000000000..f8b244504a
--- /dev/null
+++ b/test_conformance/subgroups/subgroup_common_kernels.cpp
@@ -0,0 +1,106 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "subgroup_common_kernels.h"
+
+const char* bcast_source = // Type / XY are not defined in these strings
+    "__kernel void test_bcast(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint which_sub_group_local_id = xy[gid].z;\n"
+    "    out[gid] = sub_group_broadcast(x, which_sub_group_local_id);\n"
+
+    "}\n"; // test_bcast: broadcast from local id xy[gid].z
+
+const char* redadd_source = "__kernel void test_redadd(const __global Type "
+                            "*in, __global int4 *xy, __global Type *out)\n"
+                            "{\n"
+                            "    int gid = get_global_id(0);\n"
+                            "    XY(xy,gid);\n"
+                            "    out[gid] = sub_group_reduce_add(in[gid]);\n"
+                            "}\n"; // test_redadd: sub-group add reduction
+
+const char* redmax_source = "__kernel void test_redmax(const __global Type "
+                            "*in, __global int4 *xy, __global Type *out)\n"
+                            "{\n"
+                            "    int gid = get_global_id(0);\n"
+                            "    XY(xy,gid);\n"
+                            "    out[gid] = sub_group_reduce_max(in[gid]);\n"
+                            "}\n"; // test_redmax: sub-group max reduction
+
+const char* redmin_source = "__kernel void test_redmin(const __global Type "
+                            "*in, __global int4 *xy, __global Type *out)\n"
+                            "{\n"
+                            "    int gid = get_global_id(0);\n"
+                            "    XY(xy,gid);\n"
+                            "    out[gid] = sub_group_reduce_min(in[gid]);\n"
+                            "}\n"; // test_redmin: sub-group min reduction
+
+const char* scinadd_source =
+    "__kernel void test_scinadd(const __global Type *in, __global int4 *xy, "
+    "__global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
+    "}\n"; // test_scinadd: inclusive add scan
+
+const char* scinmax_source =
+    "__kernel void test_scinmax(const __global Type *in, __global int4 *xy, "
+    "__global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
+    "}\n"; // test_scinmax: inclusive max scan
+
+const char* scinmin_source =
+    "__kernel void test_scinmin(const __global Type *in, __global int4 *xy, "
+    "__global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
+    "}\n"; // test_scinmin: inclusive min scan
+
+const char* scexadd_source =
+    "__kernel void test_scexadd(const __global Type *in, __global int4 *xy, "
+    "__global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
+    "}\n"; // test_scexadd: exclusive add scan
+
+const char* scexmax_source =
+    "__kernel void test_scexmax(const __global Type *in, __global int4 *xy, "
+    "__global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
+    "}\n"; // test_scexmax: exclusive max scan
+
+const char* scexmin_source =
+    "__kernel void test_scexmin(const __global Type *in, __global int4 *xy, "
+    "__global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
+    "}\n"; // test_scexmin: exclusive min scan
diff --git a/test_conformance/subgroups/subgroup_common_kernels.h b/test_conformance/subgroups/subgroup_common_kernels.h
new file mode 100644
index 0000000000..8ae97d9a36
--- /dev/null
+++ b/test_conformance/subgroups/subgroup_common_kernels.h
@@ -0,0 +1,32 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef SUBGROUPKERNELSOURCES_H
+#define SUBGROUPKERNELSOURCES_H
+#include "subhelpers.h"
+
+
+extern const char* bcast_source; // source of kernel test_bcast
+extern const char* redadd_source; // source of kernel test_redadd
+extern const char* redmax_source; // source of kernel test_redmax
+extern const char* redmin_source; // source of kernel test_redmin
+extern const char* scinadd_source; // source of kernel test_scinadd
+extern const char* scinmax_source; // source of kernel test_scinmax
+extern const char* scinmin_source; // source of kernel test_scinmin
+extern const char* scexadd_source; // source of kernel test_scexadd
+extern const char* scexmax_source; // source of kernel test_scexmax
+extern const char* scexmin_source; // source of kernel test_scexmin
+
+#endif
diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h
new file mode 100644
index 0000000000..b30c416b1a
--- /dev/null
+++ b/test_conformance/subgroups/subgroup_common_templates.h
@@ -0,0 +1,911 @@
+//
+// Copyright (c) 2020 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef SUBGROUPCOMMONTEMPLATES_H
+#define SUBGROUPCOMMONTEMPLATES_H
+
+#include "typeWrappers.h"
+#include <bitset>
+#include "CL/cl_half.h"
+#include "subhelpers.h"
+
+#include <set>
+
+typedef std::bitset<128> bs128;
+static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id,
+                                  const std::string &mask_type,
+                                  cl_uint max_sub_group_size)
+{
+    // Returns the mask selected by mask_type ("eq", "le", "lt", "ge" or "gt")
+    // relative to subgroup_local_id, packed into four 32-bit words with bit 0
+    // in s0. Any other mask_type leaves the mask all zero.
+    const cl_uint id = subgroup_local_id;
+    bs128 bits;
+    if (mask_type == "eq")
+        bits.set(id);
+    else if (mask_type == "le" || mask_type == "lt")
+    {
+        for (cl_uint b = 0; b <= id; ++b) bits.set(b);
+        if (mask_type == "lt") bits.reset(id);
+    }
+    else if (mask_type == "ge" || mask_type == "gt")
+    {
+        for (cl_uint b = id; b < max_sub_group_size; ++b) bits.set(b);
+        if (mask_type == "gt") bits.reset(id);
+    }
+
+    // Slice the 128-bit set into words; each result is narrowed to cl_uint.
+    const bs128 low_word{ static_cast<unsigned long>(-1) };
+    cl_uint4 packed;
+    packed.s0 = (bits & low_word).to_ulong();
+    packed.s1 = ((bits >> 32) & low_word).to_ulong();
+    packed.s2 = ((bits >> 64) & low_word).to_ulong();
+    packed.s3 = ((bits >> 96) & low_word).to_ulong();
+    return packed;
+}
+
+// DESCRIPTION :
+// sub_group_broadcast - each work_item registers its own value.
+// All work_items in the subgroup take one value from only one (any) work_item
+// sub_group_broadcast_first - same as type 0. All work_items in the
+// subgroup take only one value from only one chosen (the smallest subgroup ID)
+// work_item
+// sub_group_non_uniform_broadcast - same as type 0 but
+// only 4 work_items from subgroup enter the code (are active)
+template <typename Ty, SubgroupsBroadcastOp operation> struct BC
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    { // generates the inputs (x) and the broadcast indices (m)
+        int i, ii, j, k, n;
+        int ng = test_params.global_workgroup_size;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int nj = (nw + ns - 1) / ns;
+        int d = ns > 100 ? 100 : ns;
+        int non_uniform_size = ng % nw;
+        ng = ng / nw;
+        int last_subgroup_size = 0;
+        ii = 0;
+
+        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
+                 TypeManager<Ty>::name());
+        if (non_uniform_size)
+        {
+            log_info("  non uniform work group size mode ON\n");
+            ng++;
+        }
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            if (non_uniform_size && k == ng - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, nj, ns, nw,
+                                          last_subgroup_size);
+            }
+            for (j = 0; j < nj; ++j)
+            { // for each subgroup
+                ii = j * ns;
+                if (last_subgroup_size && j == nj - 1)
+                {
+                    n = last_subgroup_size;
+                }
+                else
+                {
+                    n = ii + ns > nw ? nw - ii : ns;
+                }
+                int bcast_if = 0;
+                int bcast_elseif = 0;
+                int bcast_index = (int)(genrand_int32(gMTdata) & 0x7fffffff)
+                    % (d > n ? n : d);
+                // bcast_index - subgroup local id whose value will be
+                // broadcast (the same index for the whole subgroup)
+                if (operation != SubgroupsBroadcastOp::broadcast)
+                {
+                    // reduce broadcasting index in case of non_uniform and
+                    // last workgroup last subgroup
+                    if (last_subgroup_size && j == nj - 1
+                        && last_subgroup_size < NR_OF_ACTIVE_WORK_ITEMS)
+                    {
+                        bcast_if = bcast_index % last_subgroup_size;
+                        bcast_elseif = bcast_if;
+                    }
+                    else
+                    { // NOTE(review): n <= NR_OF_ACTIVE_WORK_ITEMS gives a divisor <= 0
+                        bcast_if = bcast_index % NR_OF_ACTIVE_WORK_ITEMS;
+                        bcast_elseif = NR_OF_ACTIVE_WORK_ITEMS
+                            + bcast_index % (n - NR_OF_ACTIVE_WORK_ITEMS);
+                    }
+                }
+
+                for (i = 0; i < n; ++i)
+                {
+                    if (operation == SubgroupsBroadcastOp::broadcast)
+                    {
+                        int midx = 4 * ii + 4 * i + 2;
+                        m[midx] = (cl_int)bcast_index;
+                    }
+                    else
+                    {
+                        if (i < NR_OF_ACTIVE_WORK_ITEMS)
+                        {
+                            // index of the third
+                            // element in the vector.
+                            int midx = 4 * ii + 4 * i + 2;
+                            // storing information about
+                            // broadcasting index -
+                            // earlier calculated
+                            m[midx] = (cl_int)bcast_if;
+                        }
+                        else
+                        { // index of the fourth
+                          // element in the vector.
+                            int midx = 4 * ii + 4 * i + 3;
+                            m[midx] = (cl_int)bcast_elseif;
+                        }
+                    }
+
+                    // calculate value for broadcasting
+                    cl_ulong number = genrand_int64(gMTdata);
+                    set_value(t[ii + i], number);
+                }
+            }
+            // Copy the values generated for this work_group from t into x
+            for (j = 0; j < nw; ++j)
+            { // for each element in work_group
+                // t is already indexed by position in the
+                // work_group, so this is a plain copy
+                x[j] = t[j];
+            }
+            x += nw;
+            m += 4 * nw;
+        }
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    { // compares the device outputs (y) with the values expected from x and m
+        int ii, i, j, k, l, n;
+        int ng = test_params.global_workgroup_size;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int nj = (nw + ns - 1) / ns;
+        Ty tr, rr;
+        int non_uniform_size = ng % nw;
+        ng = ng / nw;
+        int last_subgroup_size = 0;
+        if (non_uniform_size) ng++;
+
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            if (non_uniform_size && k == ng - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, nj, ns, nw,
+                                          last_subgroup_size);
+            }
+            for (j = 0; j < nw; ++j)
+            { // inside the work_group
+                mx[j] = x[j]; // read host inputs for work_group
+                my[j] = y[j]; // read device outputs for work_group
+            }
+
+            for (j = 0; j < nj; ++j)
+            { // for each subgroup
+                ii = j * ns;
+                if (last_subgroup_size && j == nj - 1)
+                {
+                    n = last_subgroup_size;
+                }
+                else
+                {
+                    n = ii + ns > nw ? nw - ii : ns;
+                }
+
+                // Check result
+                if (operation == SubgroupsBroadcastOp::broadcast_first)
+                {
+                    int lowest_active_id = -1;
+                    for (i = 0; i < n; ++i)
+                    {
+
+                        lowest_active_id = i < NR_OF_ACTIVE_WORK_ITEMS
+                            ? 0
+                            : NR_OF_ACTIVE_WORK_ITEMS;
+                        //  expected: host input of the lowest
+                        //  work_item in the branch i belongs to
+                        tr = mx[ii + lowest_active_id];
+                        //  observed: device output for work_item i
+                        rr = my[ii + i];
+
+                        if (!compare(rr, tr))
+                        {
+                            log_error(
+                                "ERROR: sub_group_broadcast_first(%s) "
+                                "mismatch "
+                                "for local id %d in sub group %d in group "
+                                "%d\n",
+                                TypeManager<Ty>::name(), i, j, k);
+                            return TEST_FAIL;
+                        }
+                    }
+                }
+                else
+                {
+                    for (i = 0; i < n; ++i)
+                    {
+                        if (operation == SubgroupsBroadcastOp::broadcast)
+                        {
+                            int midx = 4 * ii + 4 * i + 2;
+                            l = (int)m[midx];
+                            tr = mx[ii + l];
+                        }
+                        else
+                        {
+                            if (i < NR_OF_ACTIVE_WORK_ITEMS)
+                            { // index into m that holds the
+                              // subgroup local id of the
+                              // broadcasting work_item
+                                int midx = 4 * ii + 4 * i + 2;
+                                // subgroup local id of the
+                                // broadcasting work_item
+                                l = (int)m[midx];
+                                // value generated on host
+                                // for that work_item
+                                tr = mx[ii + l];
+                            }
+                            else
+                            {
+                                int midx = 4 * ii + 4 * i + 3;
+                                l = (int)m[midx];
+                                tr = mx[ii + l];
+                            }
+                        }
+                        rr = my[ii + i]; // read device outputs for
+                                         // work_item in the subgroup
+
+                        if (!compare(rr, tr))
+                        {
+                            log_error("ERROR: sub_group_%s(%s) "
+                                      "mismatch for local id %d in sub "
+                                      "group %d in group %d - got %lu "
+                                      "expected %lu\n",
+                                      operation_names(operation),
+                                      TypeManager<Ty>::name(), i, j, k, rr, tr);
+                            return TEST_FAIL;
+                        }
+                    }
+                }
+            }
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+        log_info("  sub_group_%s(%s)... passed\n", operation_names(operation),
+                 TypeManager<Ty>::name());
+        return TEST_PASS;
+    }
+};
+// Converts half-precision storage (the bits in x.data) to float.
+static float to_float(subgroups::cl_half x) { return cl_half_to_float(x.data); }
+// Converts a float to half-precision storage; CL_HALF_RTE selects the rounding.
+static subgroups::cl_half to_half(float x)
+{
+    subgroups::cl_half value;
+    value.data = cl_half_from_float(x, CL_HALF_RTE);
+    return value;
+}
+
+// Generic form, used for the integer types: applies `operation` to a and b.
+// Logical operations yield 0 or 1; an unrecognised operation is logged and
+// yields 0.
+template <typename Ty> inline Ty calculate(Ty a, Ty b, ArithmeticOp operation)
+{
+    if (operation == ArithmeticOp::add_) return a + b;
+    if (operation == ArithmeticOp::max_) return a > b ? a : b;
+    if (operation == ArithmeticOp::min_) return a < b ? a : b;
+    if (operation == ArithmeticOp::mul_) return a * b;
+    if (operation == ArithmeticOp::and_) return a & b;
+    if (operation == ArithmeticOp::or_) return a | b;
+    if (operation == ArithmeticOp::xor_) return a ^ b;
+    if (operation == ArithmeticOp::logical_and) return a && b;
+    if (operation == ArithmeticOp::logical_or) return a || b;
+    if (operation == ArithmeticOp::logical_xor) return !a ^ !b;
+
+    log_error("Unknown operation request");
+    return 0;
+}
+// Specialization for cl_double: only add, max, min and mul are supported;
+// any other operation is logged and yields 0.
+template <>
+inline cl_double calculate(cl_double a, cl_double b, ArithmeticOp operation)
+{
+    switch (operation)
+    {
+        case ArithmeticOp::add_: return a + b;
+        case ArithmeticOp::mul_: return a * b;
+        case ArithmeticOp::max_:
+            // Yields b unless a compares greater (so also when unordered).
+            if (a > b) return a;
+            return b;
+        case ArithmeticOp::min_:
+            // Yields b unless a compares less (so also when unordered).
+            if (a < b) return a;
+            return b;
+        default: break;
+    }
+    log_error("Unknown operation request");
+    return 0;
+}
+
+template <>
+inline cl_float calculate(cl_float a, cl_float b, ArithmeticOp operation)
+{
+    // cl_float: add, max, min and mul only; anything else logs and yields 0.
+    switch (operation)
+    {
+        case ArithmeticOp::add_: return a + b;
+        case ArithmeticOp::mul_: return a * b;
+        case ArithmeticOp::max_:
+            // Yields b unless a compares greater (so also when unordered).
+            if (a > b) return a;
+            return b;
+        case ArithmeticOp::min_:
+            // Yields b unless a compares less (so also when unordered).
+            if (a < b) return a;
+            return b;
+        default: break;
+    }
+    log_error("Unknown operation request");
+    return 0;
+}
+
+template <>
+inline subgroups::cl_half calculate(subgroups::cl_half a, subgroups::cl_half b,
+                                    ArithmeticOp operation)
+{
+    // Computes in float; max/min return an operand and pick a when b is NaN.
+    const float fa = to_float(a), fb = to_float(b);
+    switch (operation)
+    {
+        case ArithmeticOp::add_: return to_half(fa + fb);
+        case ArithmeticOp::max_: return fa > fb || is_half_nan(b.data) ? a : b;
+        case ArithmeticOp::min_: return fa < fb || is_half_nan(b.data) ? a : b;
+        case ArithmeticOp::mul_: return to_half(fa * fb);
+        default: log_error("Unknown operation request"); break;
+    }
+    return to_half(0);
+}
+
+template <typename Ty> bool is_floating_point() // cl_half counts as well
+{
+    return std::is_same<Ty, subgroups::cl_half>::value
+        || std::is_floating_point<Ty>::value;
+}
+
+template <typename Ty, ArithmeticOp operation>
+void genrand(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
+{
+    // Fills x with ng work_groups of nw pseudo-random inputs each; t is
+    // scratch for one work_group and is filled subgroup by subgroup.
+    int nj = (nw + ns - 1) / ns; // subgroups per work_group
+    for (int k = 0; k < ng; ++k)
+    {
+        for (int j = 0; j < nj; ++j)
+        {
+            int ii = j * ns;
+            int n = ii + ns > nw ? nw - ii : ns;
+
+            for (int i = 0; i < n; ++i)
+            {
+                cl_ulong out_value;
+                if (operation == ArithmeticOp::mul_
+                    || operation == ArithmeticOp::add_)
+                {
+                    // work around to avoid overflow, do not use 0 for
+                    // multiplication
+                    out_value = (genrand_int32(gMTdata) % 4) + 1;
+                }
+                else
+                {
+                    // Keep the raw draw: after the modulo the value is
+                    // below 32 * n, so its bit 32 would always be 0.
+                    const cl_ulong raw = genrand_int64(gMTdata);
+                    out_value = raw % (32 * n);
+                    if ((operation == ArithmeticOp::logical_and
+                         || operation == ArithmeticOp::logical_or
+                         || operation == ArithmeticOp::logical_xor)
+                        && ((raw >> 32) & 1) == 0)
+                        out_value = 0; // increase probability of false
+                }
+                set_value(t[ii + i], out_value);
+            }
+        }
+
+        // Copy the generated work_group from the scratch buffer into x
+        for (int j = 0; j < nw; ++j) x[j] = t[j];
+
+        x += nw;
+        m += 4 * nw; // m is only advanced here, never read or written
+    }
+}
+
+template <typename Ty, ShuffleOp operation> struct SHF
+{ // host-side input generator (gen) and result checker (chk) for shuffles
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int i, ii, j, k, l, n, delta;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        int nj = (nw + ns - 1) / ns;
+        int d = ns > 100 ? 100 : ns;
+        ii = 0;
+        ng = ng / nw;
+        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
+                 TypeManager<Ty>::name());
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            for (j = 0; j < nj; ++j)
+            { // for each subgroup
+                ii = j * ns;
+                n = ii + ns > nw ? nw - ii : ns;
+                for (i = 0; i < n; ++i)
+                {
+                    int midx = 4 * ii + 4 * i + 2;
+                    l = (int)(genrand_int32(gMTdata) & 0x7fffffff)
+                        % (d > n ? n : d);
+                    switch (operation)
+                    {
+                        case ShuffleOp::shuffle:
+                        case ShuffleOp::shuffle_xor:
+                            // storing information about shuffle index
+                            m[midx] = (cl_int)l;
+                            break;
+                        case ShuffleOp::shuffle_up:
+                            delta = l; // calculate delta for shuffle up
+                            if (i - delta < 0)
+                            {
+                                delta = i;
+                            }
+                            m[midx] = (cl_int)delta;
+                            break;
+                        case ShuffleOp::shuffle_down:
+                            delta = l; // calculate delta for shuffle down
+                            if (i + delta >= n)
+                            {
+                                delta = n - 1 - i;
+                            }
+                            m[midx] = (cl_int)delta;
+                            break;
+                        default: break;
+                    }
+                    cl_ulong number = genrand_int64(gMTdata);
+                    set_value(t[ii + i], number);
+                }
+            }
+            // Copy the values generated for this work_group from t into x
+            for (j = 0; j < nw; ++j)
+            { // for each element in work_group
+                x[j] = t[j];
+            }
+            x += nw;
+            m += 4 * nw;
+        }
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int ii, i, j, k, l, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        int nj = (nw + ns - 1) / ns;
+        Ty tr, rr;
+        ng = ng / nw;
+
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            for (j = 0; j < nw; ++j)
+            { // inside the work_group
+                mx[j] = x[j]; // read host inputs for work_group
+                my[j] = y[j]; // read device outputs for work_group
+            }
+
+            for (j = 0; j < nj; ++j)
+            { // for each subgroup
+                ii = j * ns;
+                n = ii + ns > nw ? nw - ii : ns;
+
+                for (i = 0; i < n; ++i)
+                { // inside the subgroup
+                  // shuffle index storage
+                    int midx = 4 * ii + 4 * i + 2;
+                    l = (int)m[midx];
+                    rr = my[ii + i];
+                    switch (operation)
+                    {
+                        // shuffle basic - treat l as index
+                        case ShuffleOp::shuffle: tr = mx[ii + l]; break;
+                        // shuffle up - treat l as delta
+                        case ShuffleOp::shuffle_up: tr = mx[ii + i - l]; break;
+                        // shuffle down - treat l as delta
+                        case ShuffleOp::shuffle_down:
+                            tr = mx[ii + i + l];
+                            break;
+                        // shuffle xor - l is a mask; TODO: i ^ l may be >= n
+                        case ShuffleOp::shuffle_xor:
+                            tr = mx[ii + (i ^ l)];
+                            break;
+                        default: break; // NOTE(review): tr stays unset here
+                    }
+
+                    if (!compare(rr, tr))
+                    {
+                        log_error("ERROR: sub_group_%s(%s) mismatch for "
+                                  "local id %d in sub group %d in group %d\n",
+                                  operation_names(operation),
+                                  TypeManager<Ty>::name(), i, j, k);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+        log_info("  sub_group_%s(%s)... passed\n", operation_names(operation),
+                 TypeManager<Ty>::name());
+        return TEST_PASS;
+    }
+};
+
+template <typename Ty, ArithmeticOp operation> struct SCEX_NU
+{ // host-side generator / checker for (non-uniform) exclusive scans
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    { // logs the test header, then fills x through genrand
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        ng = ng / nw;
+        std::string func_name;
+        work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
+                        : func_name = "sub_group_scan_exclusive";
+        log_info("  %s_%s(%s)...\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name());
+        log_info("  test params: global size = %d local size = %d subgroups "
+                 "size = %d work item mask = 0x%x \n",
+                 test_params.global_workgroup_size, nw, ns, work_items_mask);
+        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+    }
+    // chk: returns TEST_FAIL on the first mismatching active work item.
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int ii, i, j, k, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        int nj = (nw + ns - 1) / ns;
+        Ty tr, rr;
+        ng = ng / nw;
+
+        std::string func_name;
+        work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
+                        : func_name = "sub_group_scan_exclusive";
+
+        uint32_t use_work_items_mask;
+        // for uniform case take into consideration all workitems
+        use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            // Copy this work_group's inputs and outputs into mx / my
+            for (j = 0; j < nw; ++j)
+            { // inside the work_group
+                mx[j] = x[j]; // read host inputs for work_group
+                my[j] = y[j]; // read device outputs for work_group
+            }
+            for (j = 0; j < nj; ++j)
+            {
+                ii = j * ns;
+                n = ii + ns > nw ? nw - ii : ns;
+                std::set<int> active_work_items;
+                for (i = 0; i < n; ++i)
+                {
+                    uint32_t check_work_item = 1u << (i % 32);
+                    if (use_work_items_mask & check_work_item)
+                    {
+                        active_work_items.insert(i);
+                    }
+                }
+                if (active_work_items.empty())
+                {
+                    log_info("  No acitve workitems in workgroup id = %d "
+                             "subgroup id = %d - no calculation\n",
+                             k, j);
+                    continue;
+                }
+                else if (active_work_items.size() == 1)
+                {
+                    log_info("  One active workitem in workgroup id = %d "
+                             "subgroup id = %d - no calculation\n",
+                             k, j);
+                    continue;
+                }
+                else
+                {
+                    // Every active work item, the first included, must match
+                    // the running value tr of the active items before it.
+                    tr = TypeManager<Ty>::identify_limits(operation);
+                    for (const int &active_work_item : active_work_items)
+                    {
+                        rr = my[ii + active_work_item];
+                        if (!compare_ordered(rr, tr))
+                        {
+                            // NOTE(review): tr / rr are Ty but printed with %d.
+                            log_error(
+                                "ERROR: %s_%s(%s) "
+                                "mismatch for local id %d in sub group %d in "
+                                "group %d Expected: %d Obtained: %d\n",
+                                func_name.c_str(), operation_names(operation),
+                                TypeManager<Ty>::name(), active_work_item, j,
+                                k, tr, rr);
+                            return TEST_FAIL;
+                        }
+                        tr = calculate<Ty>(tr, mx[ii + active_work_item],
+                                           operation);
+                    }
+                }
+            }
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+
+        log_info("  %s_%s(%s)... passed\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name());
+        return TEST_PASS;
+    }
+};
+
+// Test for scan inclusive non uniform functions
+template <typename Ty, ArithmeticOp operation> struct SCIN_NU
+{ // gen() logs the test header and runs genrand(); chk() verifies y.
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int gws = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        int ng = gws / nw;
+        std::string func_name;
+        work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
+                        : func_name = "sub_group_scan_inclusive";
+
+        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+        log_info("  %s_%s(%s)...\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name());
+        log_info("  test params: global size = %d local size = %d subgroups "
+                 "size = %d work item mask = 0x%x \n",
+                 gws, nw, ns, work_items_mask);
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int ii, i, j, k, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        int nj = (nw + ns - 1) / ns;
+        Ty tr, rr;
+        ng = ng / nw;
+
+        std::string func_name;
+        work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
+                        : func_name = "sub_group_scan_inclusive";
+
+        // A zero mask selects the uniform variant, where every work item is
+        // active; otherwise bit (i % 32) enables local id i of a sub group.
+        const uint32_t use_work_items_mask =
+            !work_items_mask ? 0xFFFFFFFF : work_items_mask;
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            // Copy this work group's slice into the scratch arrays
+            for (j = 0; j < nw; ++j)
+            { // inside the work_group
+                mx[j] = x[j]; // read host inputs for work_group
+                my[j] = y[j]; // read device outputs for work_group
+            }
+            for (j = 0; j < nj; ++j)
+            {
+                ii = j * ns;
+                n = ii + ns > nw ? nw - ii : ns;
+                std::set<int> active_work_items;
+                int catch_frist_active = -1;
+
+                for (i = 0; i < n; ++i)
+                {
+                    uint32_t check_work_item = 1u << (i % 32);
+                    if (use_work_items_mask & check_work_item)
+                    {
+                        if (catch_frist_active == -1)
+                        {
+                            catch_frist_active = i;
+                        }
+                        active_work_items.insert(i);
+                    }
+                }
+                if (active_work_items.empty())
+                {
+                    log_info("  No acitve workitems in workgroup id = %d "
+                             "subgroup id = %d - no calculation\n",
+                             k, j);
+                    continue;
+                }
+                else
+                {
+                    tr = TypeManager<Ty>::identify_limits(operation);
+                    for (const int &active_work_item : active_work_items)
+                    {
+                        rr = my[ii + active_work_item];
+                        if (active_work_items.size() == 1)
+                        {
+                            tr = mx[ii + catch_frist_active];
+                        }
+                        else
+                        {
+                            tr = calculate<Ty>(tr, mx[ii + active_work_item],
+                                               operation);
+                        }
+                        if (!compare_ordered<Ty>(rr, tr))
+                        {
+                            // NOTE(review): %d assumes tr/rr are passed as
+                            // int; other Ty may log wrong values - confirm.
+                            log_error(
+                                "ERROR: %s_%s(%s) "
+                                "mismatch for local id %d in sub group %d in "
+                                "group %d Expected: %d Obtained: %d\n",
+                                func_name.c_str(), operation_names(operation),
+                                TypeManager<Ty>::name(), active_work_item, j, k,
+                                tr, rr);
+                            return TEST_FAIL;
+                        }
+                    }
+                }
+            }
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+
+        log_info("  %s_%s(%s)... passed\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name());
+        return TEST_PASS;
+    }
+};
+
+// Test for reduce non uniform functions
+template <typename Ty, ArithmeticOp operation> struct RED_NU
+{
+    // gen() logs the test header and runs genrand(); chk() verifies y.
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int gws = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        int ng = gws / nw;
+        std::string func_name;
+
+        work_items_mask ? func_name = "sub_group_non_uniform_reduce"
+                        : func_name = "sub_group_reduce";
+        log_info("  %s_%s(%s)...\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name());
+        log_info("  test params: global size = %d local size = %d subgroups "
+                 "size = %d work item mask = 0x%x \n",
+                 gws, nw, ns, work_items_mask);
+        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int ii, i, j, k, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        int nj = (nw + ns - 1) / ns;
+        ng = ng / nw;
+        Ty tr, rr;
+
+        std::string func_name;
+        work_items_mask ? func_name = "sub_group_non_uniform_reduce"
+                        : func_name = "sub_group_reduce";
+
+        for (k = 0; k < ng; ++k)
+        {
+            // Copy this work group's slice into the scratch arrays
+            for (j = 0; j < nw; ++j)
+            {
+                mx[j] = x[j];
+                my[j] = y[j];
+            }
+
+            // Zero mask = uniform variant: consider every work item active.
+            const uint32_t use_work_items_mask =
+                !work_items_mask ? 0xFFFFFFFF : work_items_mask;
+
+            for (j = 0; j < nj; ++j)
+            {
+                ii = j * ns;
+                n = ii + ns > nw ? nw - ii : ns;
+                std::set<int> active_work_items;
+                int catch_frist_active = -1;
+                for (i = 0; i < n; ++i)
+                {
+                    uint32_t check_work_item = 1u << (i % 32);
+                    if (use_work_items_mask & check_work_item)
+                    {
+                        if (catch_frist_active == -1)
+                        {
+                            catch_frist_active = i;
+                            tr = mx[ii + i];
+                            active_work_items.insert(i);
+                            continue;
+                        }
+                        active_work_items.insert(i);
+                        tr = calculate<Ty>(tr, mx[ii + i], operation);
+                    }
+                }
+
+                if (active_work_items.empty())
+                {
+                    log_info("  No acitve workitems in workgroup id = %d "
+                             "subgroup id = %d - no calculation\n",
+                             k, j);
+                    continue;
+                }
+
+                for (const int &active_work_item : active_work_items)
+                {
+                    rr = my[ii + active_work_item];
+                    if (!compare_ordered<Ty>(rr, tr))
+                    {
+                        // NOTE(review): %d presumes int-sized tr/rr - confirm.
+                        log_error("ERROR: %s_%s(%s) "
+                                  "mismatch for local id %d in sub group %d in "
+                                  "group %d Expected: %d Obtained: %d\n",
+                                  func_name.c_str(), operation_names(operation),
+                                  TypeManager<Ty>::name(), active_work_item, j,
+                                  k, tr, rr);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+
+        log_info("  %s_%s(%s)... passed\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name());
+        return TEST_PASS;
+    }
+};
+
+#endif
diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h
index dc49af2d25..93673b3579 100644
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -19,13 +19,176 @@
 #include "testHarness.h"
 #include "kernelHelpers.h"
 #include "typeWrappers.h"
+#include "imageHelpers.h"
 
 #include <limits>
 #include <vector>
+#include <type_traits>
+
+#define NR_OF_ACTIVE_WORK_ITEMS 4
+
+extern MTdata gMTdata;
+
+// Work-group configuration handed to the sub group test routines. Callers
+// supply the global and local sizes (plus optional extension and mask
+// lists); every other field starts at the default given in the constructor.
+struct WorkGroupParams
+{
+    WorkGroupParams(size_t gws, size_t lws,
+                    const std::vector<std::string> &req_ext = {},
+                    const std::vector<uint32_t> &all_wim = {})
+        : global_workgroup_size(gws), local_workgroup_size(lws),
+          subgroup_size(0), work_items_mask(0), dynsc(0),
+          use_core_subgroups(true), required_extensions(req_ext),
+          all_work_item_masks(all_wim)
+    {}
+    size_t global_workgroup_size;
+    size_t local_workgroup_size;
+    size_t subgroup_size;
+    uint32_t work_items_mask;
+    int dynsc;
+    bool use_core_subgroups;
+    std::vector<std::string> required_extensions;
+    std::vector<uint32_t> all_work_item_masks;
+};
+
+enum class SubgroupsBroadcastOp
+{
+    broadcast,
+    broadcast_first,
+    non_uniform_broadcast
+};
+
+enum class NonUniformVoteOp
+{
+    elect,
+    all,
+    any,
+    all_equal
+};
+
+enum class BallotOp
+{
+    ballot,
+    inverse_ballot,
+    ballot_bit_extract,
+    ballot_bit_count,
+    ballot_inclusive_scan,
+    ballot_exclusive_scan,
+    ballot_find_lsb,
+    ballot_find_msb,
+    eq_mask,
+    ge_mask,
+    gt_mask,
+    le_mask,
+    lt_mask,
+};
+
+enum class ShuffleOp
+{
+    shuffle,
+    shuffle_up,
+    shuffle_down,
+    shuffle_xor
+};
+
+enum class ArithmeticOp
+{
+    add_,
+    max_,
+    min_,
+    mul_,
+    and_,
+    or_,
+    xor_,
+    logical_and,
+    logical_or,
+    logical_xor
+};
+
+static const char *const operation_names(ArithmeticOp operation)
+{ // Returns the printable name of an arithmetic operation.
+    switch (operation)
+    {
+        case ArithmeticOp::add_: return "add";
+        case ArithmeticOp::max_: return "max";
+        case ArithmeticOp::min_: return "min";
+        case ArithmeticOp::mul_: return "mul";
+        case ArithmeticOp::and_: return "and";
+        case ArithmeticOp::or_: return "or";
+        case ArithmeticOp::xor_: return "xor";
+        case ArithmeticOp::logical_and: return "logical_and";
+        case ArithmeticOp::logical_or: return "logical_or";
+        case ArithmeticOp::logical_xor: return "logical_xor";
+        default: log_error("Unknown operation request"); break;
+    }
+    return ""; // unrecognized value: error logged above, empty name returned
+}
+
+static const char *const operation_names(BallotOp operation)
+{ // Returns the printable name of a ballot operation.
+    switch (operation)
+    {
+        case BallotOp::ballot: return "ballot";
+        case BallotOp::inverse_ballot: return "inverse_ballot";
+        case BallotOp::ballot_bit_extract: return "bit_extract";
+        case BallotOp::ballot_bit_count: return "bit_count";
+        case BallotOp::ballot_inclusive_scan: return "inclusive_scan";
+        case BallotOp::ballot_exclusive_scan: return "exclusive_scan";
+        case BallotOp::ballot_find_lsb: return "find_lsb";
+        case BallotOp::ballot_find_msb: return "find_msb";
+        case BallotOp::eq_mask: return "eq";
+        case BallotOp::ge_mask: return "ge";
+        case BallotOp::gt_mask: return "gt";
+        case BallotOp::le_mask: return "le";
+        case BallotOp::lt_mask: return "lt";
+        default: log_error("Unknown operation request"); break;
+    }
+    return ""; // unrecognized value: error logged above, empty name returned
+}
+
+static const char *const operation_names(ShuffleOp operation)
+{ // Returns the printable name of a shuffle operation.
+    switch (operation)
+    {
+        case ShuffleOp::shuffle: return "shuffle";
+        case ShuffleOp::shuffle_up: return "shuffle_up";
+        case ShuffleOp::shuffle_down: return "shuffle_down";
+        case ShuffleOp::shuffle_xor: return "shuffle_xor";
+        default: log_error("Unknown operation request"); break;
+    }
+    return ""; // unrecognized value: error logged above, empty name returned
+}
+
+static const char *const operation_names(NonUniformVoteOp operation)
+{ // Returns the printable name of a non-uniform vote operation.
+    switch (operation)
+    {
+        case NonUniformVoteOp::all: return "all";
+        case NonUniformVoteOp::all_equal: return "all_equal";
+        case NonUniformVoteOp::any: return "any";
+        case NonUniformVoteOp::elect: return "elect";
+        default: log_error("Unknown operation request"); break;
+    }
+    return ""; // unrecognized value: error logged above, empty name returned
+}
+
+static const char *const operation_names(SubgroupsBroadcastOp operation)
+{ // Returns the printable name of a sub group broadcast operation.
+    switch (operation)
+    {
+        case SubgroupsBroadcastOp::broadcast: return "broadcast";
+        case SubgroupsBroadcastOp::broadcast_first: return "broadcast_first";
+        case SubgroupsBroadcastOp::non_uniform_broadcast:
+            return "non_uniform_broadcast";
+        default: log_error("Unknown operation request"); break;
+    }
+    return ""; // unrecognized value: error logged above, empty name returned
+}
 
 class subgroupsAPI {
 public:
-    subgroupsAPI(cl_platform_id platform, bool useCoreSubgroups)
+    subgroupsAPI(cl_platform_id platform, bool use_core_subgroups)
     {
         static_assert(CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE
                           == CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,
@@ -33,7 +196,7 @@ class subgroupsAPI {
         static_assert(CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE
                           == CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR,
                       "Enums have to be the same");
-        if (useCoreSubgroups)
+        if (use_core_subgroups)
         {
             _clGetKernelSubGroupInfo_ptr = &clGetKernelSubGroupInfo;
             clGetKernelSubGroupInfo_name = "clGetKernelSubGroupInfo";
@@ -56,220 +219,950 @@ class subgroupsAPI {
     clGetKernelSubGroupInfoKHR_fn _clGetKernelSubGroupInfo_ptr;
 };
 
-// Some template helpers
-template <typename Ty> struct TypeName;
-template <> struct TypeName<cl_half>
+// Custom wrapper types are needed for 3-component vectors and for half. The
+// 3-component types are otherwise indistinguishable from the 4-component
+// types, and the half type is indistinguishable from another 16-bit type
+// (ushort).
+namespace subgroups {
+struct cl_char3
 {
-    static const char *val() { return "half"; }
+    ::cl_char3 data;
 };
-template <> struct TypeName<cl_uint>
+struct cl_uchar3
 {
-    static const char *val() { return "uint"; }
+    ::cl_uchar3 data;
 };
-template <> struct TypeName<cl_int>
+struct cl_short3
 {
-    static const char *val() { return "int"; }
+    ::cl_short3 data;
 };
-template <> struct TypeName<cl_ulong>
+struct cl_ushort3
 {
-    static const char *val() { return "ulong"; }
+    ::cl_ushort3 data;
 };
-template <> struct TypeName<cl_long>
+struct cl_int3
 {
-    static const char *val() { return "long"; }
+    ::cl_int3 data;
 };
-template <> struct TypeName<float>
+struct cl_uint3
 {
-    static const char *val() { return "float"; }
+    ::cl_uint3 data;
 };
-template <> struct TypeName<double>
+struct cl_long3
 {
-    static const char *val() { return "double"; }
+    ::cl_long3 data;
 };
-
-template <typename Ty> struct TypeDef;
-template <> struct TypeDef<cl_half>
+struct cl_ulong3
+{
+    ::cl_ulong3 data;
+};
+struct cl_float3
+{
+    ::cl_float3 data;
+};
+struct cl_double3
 {
-    static const char *val() { return "typedef half Type;\n"; }
+    ::cl_double3 data;
 };
-template <> struct TypeDef<cl_uint>
+struct cl_half
 {
-    static const char *val() { return "typedef uint Type;\n"; }
+    ::cl_half data;
 };
-template <> struct TypeDef<cl_int>
+struct cl_half2
 {
-    static const char *val() { return "typedef int Type;\n"; }
+    ::cl_half2 data;
 };
-template <> struct TypeDef<cl_ulong>
+struct cl_half3
 {
-    static const char *val() { return "typedef ulong Type;\n"; }
+    ::cl_half3 data;
 };
-template <> struct TypeDef<cl_long>
+struct cl_half4
 {
-    static const char *val() { return "typedef long Type;\n"; }
+    ::cl_half4 data;
 };
-template <> struct TypeDef<float>
+struct cl_half8
 {
-    static const char *val() { return "typedef float Type;\n"; }
+    ::cl_half8 data;
 };
-template <> struct TypeDef<double>
+struct cl_half16
 {
-    static const char *val() { return "typedef double Type;\n"; }
+    ::cl_half16 data;
 };
+}
 
-template <typename Ty, int Which> struct TypeIdentity;
-// template <> struct TypeIdentity<cl_half,0> { static cl_half val() { return
-// (cl_half)0.0; } }; template <> struct TypeIdentity<cl_half,0> { static
-// cl_half val() { return -(cl_half)65536.0; } }; template <> struct
-// TypeIdentity<cl_half,0> { static cl_half val() { return (cl_half)65536.0; }
-// };
+static bool int64_ok(cl_device_id device)
+{
+    char profile[128];
+    int error;
 
-template <> struct TypeIdentity<cl_uint, 0>
+    error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile),
+                            (void *)&profile, NULL);
+    if (error)
+    {
+        log_info("clGetDeviceInfo failed with CL_DEVICE_PROFILE\n");
+        return false;
+    }
+
+    if (strcmp(profile, "EMBEDDED_PROFILE") == 0)
+        return is_extension_available(device, "cles_khr_int64");
+
+    return true;
+}
+
+// True when CL_DEVICE_DOUBLE_FP_CONFIG can be queried and is non-zero.
+static bool double_ok(cl_device_id device)
+{
+    cl_device_fp_config fp_config;
+    const int status = clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG,
+                                       sizeof(fp_config), &fp_config, NULL);
+    if (status != 0)
+    {
+        log_info("clGetDeviceInfo failed with CL_DEVICE_DOUBLE_FP_CONFIG\n");
+        return false;
+    }
+    return fp_config != 0;
+}
+
+// True when CL_DEVICE_HALF_FP_CONFIG can be queried and is non-zero.
+static bool half_ok(cl_device_id device)
+{
+    cl_device_fp_config fp_config;
+    const int status = clGetDeviceInfo(device, CL_DEVICE_HALF_FP_CONFIG,
+                                       sizeof(fp_config), &fp_config, NULL);
+    if (status != 0)
+    {
+        log_info("clGetDeviceInfo failed with CL_DEVICE_HALF_FP_CONFIG\n");
+        return false;
+    }
+    return fp_config != 0;
+}
+
+template <typename Ty> struct CommonTypeManager
 {
-    static cl_uint val() { return (cl_uint)0; }
+
+    static const char *name() { return ""; }
+    static const char *add_typedef() { return "\n"; }
+    typedef std::false_type is_vector_type;
+    typedef std::false_type is_sb_vector_size3;
+    typedef std::false_type is_sb_vector_type;
+    typedef std::false_type is_sb_scalar_type;
+    static const bool type_supported(cl_device_id) { return true; }
+    static const Ty identify_limits(ArithmeticOp operation)
+    {
+        switch (operation)
+        {
+            case ArithmeticOp::add_: return (Ty)0;
+            case ArithmeticOp::max_: return (std::numeric_limits<Ty>::min)();
+            case ArithmeticOp::min_: return (std::numeric_limits<Ty>::max)();
+            case ArithmeticOp::mul_: return (Ty)1;
+            case ArithmeticOp::and_: return (Ty)~0;
+            case ArithmeticOp::or_: return (Ty)0;
+            case ArithmeticOp::xor_: return (Ty)0;
+            default: log_error("Unknown operation request"); break;
+        }
+        return 0;
+    }
 };
-template <> struct TypeIdentity<cl_uint, 1>
+
+template <typename> struct TypeManager;
+
+template <> struct TypeManager<cl_int> : public CommonTypeManager<cl_int>
 {
-    static cl_uint val() { return (cl_uint)0; }
+    static const char *name() { return "int"; }
+    static const char *add_typedef() { return "typedef int Type;\n"; }
+    static cl_int identify_limits(ArithmeticOp operation)
+    {
+        switch (operation)
+        {
+            case ArithmeticOp::add_: return (cl_int)0;
+            case ArithmeticOp::max_:
+                return (std::numeric_limits<cl_int>::min)();
+            case ArithmeticOp::min_:
+                return (std::numeric_limits<cl_int>::max)();
+            case ArithmeticOp::mul_: return (cl_int)1;
+            case ArithmeticOp::and_: return (cl_int)~0;
+            case ArithmeticOp::or_: return (cl_int)0;
+            case ArithmeticOp::xor_: return (cl_int)0;
+            case ArithmeticOp::logical_and: return (cl_int)1;
+            case ArithmeticOp::logical_or: return (cl_int)0;
+            case ArithmeticOp::logical_xor: return (cl_int)0;
+            default: log_error("Unknown operation request"); break;
+        }
+        return 0;
+    }
 };
-template <> struct TypeIdentity<cl_uint, 2>
+template <> struct TypeManager<cl_int2> : public CommonTypeManager<cl_int2>
 {
-    static cl_uint val() { return (cl_uint)0xffffffff; }
+    static const char *name() { return "int2"; }
+    static const char *add_typedef() { return "typedef int2 Type;\n"; }
+    typedef std::true_type is_vector_type;
+    using scalar_type = cl_int;
 };
-
-template <> struct TypeIdentity<cl_int, 0>
+template <>
+struct TypeManager<subgroups::cl_int3>
+    : public CommonTypeManager<subgroups::cl_int3>
 {
-    static cl_int val() { return (cl_int)0; }
+    static const char *name() { return "int3"; }
+    static const char *add_typedef() { return "typedef int3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_int;
 };
-template <> struct TypeIdentity<cl_int, 1>
+template <> struct TypeManager<cl_int4> : public CommonTypeManager<cl_int4>
 {
-    static cl_int val() { return (cl_int)0x80000000; }
+    static const char *name() { return "int4"; }
+    static const char *add_typedef() { return "typedef int4 Type;\n"; }
+    using scalar_type = cl_int;
+    typedef std::true_type is_vector_type;
 };
-template <> struct TypeIdentity<cl_int, 2>
+template <> struct TypeManager<cl_int8> : public CommonTypeManager<cl_int8>
 {
-    static cl_int val() { return (cl_int)0x7fffffff; }
+    static const char *name() { return "int8"; }
+    static const char *add_typedef() { return "typedef int8 Type;\n"; }
+    using scalar_type = cl_int;
+    typedef std::true_type is_vector_type;
 };
-
-template <> struct TypeIdentity<cl_ulong, 0>
+template <> struct TypeManager<cl_int16> : public CommonTypeManager<cl_int16>
 {
-    static cl_ulong val() { return (cl_ulong)0; }
+    static const char *name() { return "int16"; }
+    static const char *add_typedef() { return "typedef int16 Type;\n"; }
+    using scalar_type = cl_int;
+    typedef std::true_type is_vector_type;
 };
-template <> struct TypeIdentity<cl_ulong, 1>
+// cl_uint
+template <> struct TypeManager<cl_uint> : public CommonTypeManager<cl_uint>
 {
-    static cl_ulong val() { return (cl_ulong)0; }
+    static const char *name() { return "uint"; }
+    static const char *add_typedef() { return "typedef uint Type;\n"; }
 };
-template <> struct TypeIdentity<cl_ulong, 2>
+template <> struct TypeManager<cl_uint2> : public CommonTypeManager<cl_uint2>
 {
-    static cl_ulong val() { return (cl_ulong)0xffffffffffffffffULL; }
+    static const char *name() { return "uint2"; }
+    static const char *add_typedef() { return "typedef uint2 Type;\n"; }
+    using scalar_type = cl_uint;
+    typedef std::true_type is_vector_type;
 };
-
-template <> struct TypeIdentity<cl_long, 0>
+template <>
+struct TypeManager<subgroups::cl_uint3>
+    : public CommonTypeManager<subgroups::cl_uint3>
 {
-    static cl_long val() { return (cl_long)0; }
+    static const char *name() { return "uint3"; }
+    static const char *add_typedef() { return "typedef uint3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_uint;
 };
-template <> struct TypeIdentity<cl_long, 1>
+template <> struct TypeManager<cl_uint4> : public CommonTypeManager<cl_uint4>
 {
-    static cl_long val() { return (cl_long)0x8000000000000000ULL; }
+    static const char *name() { return "uint4"; }
+    static const char *add_typedef() { return "typedef uint4 Type;\n"; }
+    using scalar_type = cl_uint;
+    typedef std::true_type is_vector_type;
 };
-template <> struct TypeIdentity<cl_long, 2>
+template <> struct TypeManager<cl_uint8> : public CommonTypeManager<cl_uint8>
 {
-    static cl_long val() { return (cl_long)0x7fffffffffffffffULL; }
+    static const char *name() { return "uint8"; }
+    static const char *add_typedef() { return "typedef uint8 Type;\n"; }
+    using scalar_type = cl_uint;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_uint16> : public CommonTypeManager<cl_uint16>
+{
+    static const char *name() { return "uint16"; }
+    static const char *add_typedef() { return "typedef uint16 Type;\n"; }
+    using scalar_type = cl_uint;
+    typedef std::true_type is_vector_type;
+};
+// cl_short
+template <> struct TypeManager<cl_short> : public CommonTypeManager<cl_short>
+{
+    static const char *name() { return "short"; }
+    static const char *add_typedef() { return "typedef short Type;\n"; }
+};
+template <> struct TypeManager<cl_short2> : public CommonTypeManager<cl_short2>
+{
+    static const char *name() { return "short2"; }
+    static const char *add_typedef() { return "typedef short2 Type;\n"; }
+    using scalar_type = cl_short;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<subgroups::cl_short3>
+    : public CommonTypeManager<subgroups::cl_short3>
+{
+    static const char *name() { return "short3"; }
+    static const char *add_typedef() { return "typedef short3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_short;
+};
+template <> struct TypeManager<cl_short4> : public CommonTypeManager<cl_short4>
+{
+    static const char *name() { return "short4"; }
+    static const char *add_typedef() { return "typedef short4 Type;\n"; }
+    using scalar_type = cl_short;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_short8> : public CommonTypeManager<cl_short8>
+{
+    static const char *name() { return "short8"; }
+    static const char *add_typedef() { return "typedef short8 Type;\n"; }
+    using scalar_type = cl_short;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<cl_short16> : public CommonTypeManager<cl_short16>
+{
+    static const char *name() { return "short16"; }
+    static const char *add_typedef() { return "typedef short16 Type;\n"; }
+    using scalar_type = cl_short;
+    typedef std::true_type is_vector_type;
+};
+// cl_ushort
+template <> struct TypeManager<cl_ushort> : public CommonTypeManager<cl_ushort>
+{
+    static const char *name() { return "ushort"; }
+    static const char *add_typedef() { return "typedef ushort Type;\n"; }
+};
+template <>
+struct TypeManager<cl_ushort2> : public CommonTypeManager<cl_ushort2>
+{
+    static const char *name() { return "ushort2"; }
+    static const char *add_typedef() { return "typedef ushort2 Type;\n"; }
+    using scalar_type = cl_ushort;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<subgroups::cl_ushort3>
+    : public CommonTypeManager<subgroups::cl_ushort3>
+{
+    static const char *name() { return "ushort3"; }
+    static const char *add_typedef() { return "typedef ushort3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_ushort;
+};
+template <>
+struct TypeManager<cl_ushort4> : public CommonTypeManager<cl_ushort4>
+{
+    static const char *name() { return "ushort4"; }
+    static const char *add_typedef() { return "typedef ushort4 Type;\n"; }
+    using scalar_type = cl_ushort;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<cl_ushort8> : public CommonTypeManager<cl_ushort8>
+{
+    static const char *name() { return "ushort8"; }
+    static const char *add_typedef() { return "typedef ushort8 Type;\n"; }
+    using scalar_type = cl_ushort;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<cl_ushort16> : public CommonTypeManager<cl_ushort16>
+{
+    static const char *name() { return "ushort16"; }
+    static const char *add_typedef() { return "typedef ushort16 Type;\n"; }
+    using scalar_type = cl_ushort;
+    typedef std::true_type is_vector_type;
+};
+// cl_char
+template <> struct TypeManager<cl_char> : public CommonTypeManager<cl_char>
+{
+    static const char *name() { return "char"; }
+    static const char *add_typedef() { return "typedef char Type;\n"; }
+};
+template <> struct TypeManager<cl_char2> : public CommonTypeManager<cl_char2>
+{
+    static const char *name() { return "char2"; }
+    static const char *add_typedef() { return "typedef char2 Type;\n"; }
+    using scalar_type = cl_char;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<subgroups::cl_char3>
+    : public CommonTypeManager<subgroups::cl_char3>
+{
+    static const char *name() { return "char3"; }
+    static const char *add_typedef() { return "typedef char3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_char;
+};
+template <> struct TypeManager<cl_char4> : public CommonTypeManager<cl_char4>
+{
+    static const char *name() { return "char4"; }
+    static const char *add_typedef() { return "typedef char4 Type;\n"; }
+    using scalar_type = cl_char;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_char8> : public CommonTypeManager<cl_char8>
+{
+    static const char *name() { return "char8"; }
+    static const char *add_typedef() { return "typedef char8 Type;\n"; }
+    using scalar_type = cl_char;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_char16> : public CommonTypeManager<cl_char16>
+{
+    static const char *name() { return "char16"; }
+    static const char *add_typedef() { return "typedef char16 Type;\n"; }
+    using scalar_type = cl_char;
+    typedef std::true_type is_vector_type;
+};
+// cl_uchar
+template <> struct TypeManager<cl_uchar> : public CommonTypeManager<cl_uchar>
+{
+    static const char *name() { return "uchar"; }
+    static const char *add_typedef() { return "typedef uchar Type;\n"; }
+};
+template <> struct TypeManager<cl_uchar2> : public CommonTypeManager<cl_uchar2>
+{
+    static const char *name() { return "uchar2"; }
+    static const char *add_typedef() { return "typedef uchar2 Type;\n"; }
+    using scalar_type = cl_uchar;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<subgroups::cl_uchar3>
+    : public CommonTypeManager<subgroups::cl_uchar3>
+{ // Type traits for the wrapped 3-component uchar vector.
+    static const char *name() { return "uchar3"; }
+    static const char *add_typedef() { return "typedef uchar3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_uchar;
+};
+template <> struct TypeManager<cl_uchar4> : public CommonTypeManager<cl_uchar4>
+{
+    static const char *name() { return "uchar4"; }
+    static const char *add_typedef() { return "typedef uchar4 Type;\n"; }
+    using scalar_type = cl_uchar;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_uchar8> : public CommonTypeManager<cl_uchar8>
+{
+    static const char *name() { return "uchar8"; }
+    static const char *add_typedef() { return "typedef uchar8 Type;\n"; }
+    using scalar_type = cl_uchar;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<cl_uchar16> : public CommonTypeManager<cl_uchar16>
+{
+    static const char *name() { return "uchar16"; }
+    static const char *add_typedef() { return "typedef uchar16 Type;\n"; }
+    using scalar_type = cl_uchar;
+    typedef std::true_type is_vector_type;
+};
+// cl_long
+template <> struct TypeManager<cl_long> : public CommonTypeManager<cl_long>
+{
+    static const char *name() { return "long"; }
+    static const char *add_typedef() { return "typedef long Type;\n"; }
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <> struct TypeManager<cl_long2> : public CommonTypeManager<cl_long2>
+{
+    static const char *name() { return "long2"; }
+    static const char *add_typedef() { return "typedef long2 Type;\n"; }
+    using scalar_type = cl_long;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <>
+struct TypeManager<subgroups::cl_long3>
+    : public CommonTypeManager<subgroups::cl_long3>
+{
+    static const char *name() { return "long3"; }
+    static const char *add_typedef() { return "typedef long3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_long;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <> struct TypeManager<cl_long4> : public CommonTypeManager<cl_long4>
+{
+    static const char *name() { return "long4"; }
+    static const char *add_typedef() { return "typedef long4 Type;\n"; }
+    using scalar_type = cl_long;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <> struct TypeManager<cl_long8> : public CommonTypeManager<cl_long8>
+{
+    static const char *name() { return "long8"; }
+    static const char *add_typedef() { return "typedef long8 Type;\n"; }
+    using scalar_type = cl_long;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <> struct TypeManager<cl_long16> : public CommonTypeManager<cl_long16>
+{
+    static const char *name() { return "long16"; }
+    static const char *add_typedef() { return "typedef long16 Type;\n"; }
+    using scalar_type = cl_long;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+// cl_ulong
+template <> struct TypeManager<cl_ulong> : public CommonTypeManager<cl_ulong>
+{
+    static const char *name() { return "ulong"; }
+    static const char *add_typedef() { return "typedef ulong Type;\n"; }
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <> struct TypeManager<cl_ulong2> : public CommonTypeManager<cl_ulong2>
+{
+    static const char *name() { return "ulong2"; }
+    static const char *add_typedef() { return "typedef ulong2 Type;\n"; }
+    using scalar_type = cl_ulong;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <>
+struct TypeManager<subgroups::cl_ulong3>
+    : public CommonTypeManager<subgroups::cl_ulong3>
+{
+    static const char *name() { return "ulong3"; }
+    static const char *add_typedef() { return "typedef ulong3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_ulong;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <> struct TypeManager<cl_ulong4> : public CommonTypeManager<cl_ulong4>
+{
+    static const char *name() { return "ulong4"; }
+    static const char *add_typedef() { return "typedef ulong4 Type;\n"; }
+    using scalar_type = cl_ulong;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <> struct TypeManager<cl_ulong8> : public CommonTypeManager<cl_ulong8>
+{
+    static const char *name() { return "ulong8"; }
+    static const char *add_typedef() { return "typedef ulong8 Type;\n"; }
+    using scalar_type = cl_ulong;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <>
+struct TypeManager<cl_ulong16> : public CommonTypeManager<cl_ulong16>
+{
+    static const char *name() { return "ulong16"; }
+    static const char *add_typedef() { return "typedef ulong16 Type;\n"; }
+    using scalar_type = cl_ulong;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
 };
 
-
-template <> struct TypeIdentity<float, 0>
+// cl_float
+template <> struct TypeManager<cl_float> : public CommonTypeManager<cl_float>
 {
-    static float val() { return 0.F; }
+    static const char *name() { return "float"; }
+    static const char *add_typedef() { return "typedef float Type;\n"; }
+    static cl_float identify_limits(ArithmeticOp operation)
+    {
+        switch (operation)
+        {
+            case ArithmeticOp::add_: return 0.0f;
+            case ArithmeticOp::max_:
+                return -std::numeric_limits<float>::infinity();
+            case ArithmeticOp::min_:
+                return std::numeric_limits<float>::infinity();
+            case ArithmeticOp::mul_: return (cl_float)1;
+            default: log_error("Unknown operation request"); break;
+        }
+        return 0;
+    }
 };
-template <> struct TypeIdentity<float, 1>
+template <> struct TypeManager<cl_float2> : public CommonTypeManager<cl_float2>
 {
-    static float val() { return -std::numeric_limits<float>::infinity(); }
+    static const char *name() { return "float2"; }
+    static const char *add_typedef() { return "typedef float2 Type;\n"; }
+    using scalar_type = cl_float;
+    typedef std::true_type is_vector_type;
 };
-template <> struct TypeIdentity<float, 2>
+template <>
+struct TypeManager<subgroups::cl_float3>
+    : public CommonTypeManager<subgroups::cl_float3>
 {
-    static float val() { return std::numeric_limits<float>::infinity(); }
+    static const char *name() { return "float3"; }
+    static const char *add_typedef() { return "typedef float3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_float;
+};
+template <> struct TypeManager<cl_float4> : public CommonTypeManager<cl_float4>
+{
+    static const char *name() { return "float4"; }
+    static const char *add_typedef() { return "typedef float4 Type;\n"; }
+    using scalar_type = cl_float;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_float8> : public CommonTypeManager<cl_float8>
+{
+    static const char *name() { return "float8"; }
+    static const char *add_typedef() { return "typedef float8 Type;\n"; }
+    using scalar_type = cl_float;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<cl_float16> : public CommonTypeManager<cl_float16>
+{
+    static const char *name() { return "float16"; }
+    static const char *add_typedef() { return "typedef float16 Type;\n"; }
+    using scalar_type = cl_float;
+    typedef std::true_type is_vector_type;
 };
 
-template <> struct TypeIdentity<double, 0>
+// cl_double
+template <> struct TypeManager<cl_double> : public CommonTypeManager<cl_double>
+{
+    static const char *name() { return "double"; }
+    static const char *add_typedef() { return "typedef double Type;\n"; }
+    static cl_double identify_limits(ArithmeticOp operation)
+    {
+        switch (operation)
+        {
+            case ArithmeticOp::add_: return 0.0;
+            case ArithmeticOp::max_:
+                return -std::numeric_limits<double>::infinity();
+            case ArithmeticOp::min_:
+                return std::numeric_limits<double>::infinity();
+            case ArithmeticOp::mul_: return (cl_double)1;
+            default: log_error("Unknown operation request"); break;
+        }
+        return 0;
+    }
+    static const bool type_supported(cl_device_id device)
+    {
+        return double_ok(device);
+    }
+};
+template <>
+struct TypeManager<cl_double2> : public CommonTypeManager<cl_double2>
+{
+    static const char *name() { return "double2"; }
+    static const char *add_typedef() { return "typedef double2 Type;\n"; }
+    using scalar_type = cl_double;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return double_ok(device);
+    }
+};
+template <>
+struct TypeManager<subgroups::cl_double3>
+    : public CommonTypeManager<subgroups::cl_double3>
+{
+    static const char *name() { return "double3"; }
+    static const char *add_typedef() { return "typedef double3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_double;
+    static const bool type_supported(cl_device_id device)
+    {
+        return double_ok(device);
+    }
+};
+template <>
+struct TypeManager<cl_double4> : public CommonTypeManager<cl_double4>
 {
-    static double val() { return 0.L; }
+    static const char *name() { return "double4"; }
+    static const char *add_typedef() { return "typedef double4 Type;\n"; }
+    using scalar_type = cl_double;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return double_ok(device);
+    }
+};
+template <>
+struct TypeManager<cl_double8> : public CommonTypeManager<cl_double8>
+{
+    static const char *name() { return "double8"; }
+    static const char *add_typedef() { return "typedef double8 Type;\n"; }
+    using scalar_type = cl_double;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return double_ok(device);
+    }
+};
+template <>
+struct TypeManager<cl_double16> : public CommonTypeManager<cl_double16>
+{
+    static const char *name() { return "double16"; }
+    static const char *add_typedef() { return "typedef double16 Type;\n"; }
+    using scalar_type = cl_double;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return double_ok(device);
+    }
 };
 
-template <> struct TypeIdentity<double, 1>
+// cl_half
+template <>
+struct TypeManager<subgroups::cl_half>
+    : public CommonTypeManager<subgroups::cl_half>
 {
-    static double val() { return -std::numeric_limits<double>::infinity(); }
+    static const char *name() { return "half"; }
+    static const char *add_typedef() { return "typedef half Type;\n"; }
+    typedef std::true_type is_sb_scalar_type;
+    static subgroups::cl_half identify_limits(ArithmeticOp operation)
+    {
+        switch (operation)
+        {
+            case ArithmeticOp::add_: return { 0x0000 };
+            case ArithmeticOp::max_: return { 0xfc00 };
+            case ArithmeticOp::min_: return { 0x7c00 };
+            case ArithmeticOp::mul_: return { 0x3c00 };
+            default: log_error("Unknown operation request"); break;
+        }
+        return { 0 };
+    }
+    static const bool type_supported(cl_device_id device)
+    {
+        return half_ok(device);
+    }
 };
-template <> struct TypeIdentity<double, 2>
+template <>
+struct TypeManager<subgroups::cl_half2>
+    : public CommonTypeManager<subgroups::cl_half2>
 {
-    static double val() { return std::numeric_limits<double>::infinity(); }
+    static const char *name() { return "half2"; }
+    static const char *add_typedef() { return "typedef half2 Type;\n"; }
+    using scalar_type = subgroups::cl_half;
+    typedef std::true_type is_sb_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return half_ok(device);
+    }
 };
+template <>
+struct TypeManager<subgroups::cl_half3>
+    : public CommonTypeManager<subgroups::cl_half3>
+{
+    static const char *name() { return "half3"; }
+    static const char *add_typedef() { return "typedef half3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = subgroups::cl_half;
 
-template <typename Ty> struct TypeCheck;
-template <> struct TypeCheck<cl_uint>
+    static const bool type_supported(cl_device_id device)
+    {
+        return half_ok(device);
+    }
+};
+template <>
+struct TypeManager<subgroups::cl_half4>
+    : public CommonTypeManager<subgroups::cl_half4>
 {
-    static bool val(cl_device_id) { return true; }
+    static const char *name() { return "half4"; }
+    static const char *add_typedef() { return "typedef half4 Type;\n"; }
+    using scalar_type = subgroups::cl_half;
+    typedef std::true_type is_sb_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return half_ok(device);
+    }
 };
-template <> struct TypeCheck<cl_int>
+template <>
+struct TypeManager<subgroups::cl_half8>
+    : public CommonTypeManager<subgroups::cl_half8>
 {
-    static bool val(cl_device_id) { return true; }
+    static const char *name() { return "half8"; }
+    static const char *add_typedef() { return "typedef half8 Type;\n"; }
+    using scalar_type = subgroups::cl_half;
+    typedef std::true_type is_sb_vector_type;
+
+    static const bool type_supported(cl_device_id device)
+    {
+        return half_ok(device);
+    }
+};
+template <>
+struct TypeManager<subgroups::cl_half16>
+    : public CommonTypeManager<subgroups::cl_half16>
+{
+    static const char *name() { return "half16"; }
+    static const char *add_typedef() { return "typedef half16 Type;\n"; }
+    using scalar_type = subgroups::cl_half;
+    typedef std::true_type is_sb_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return half_ok(device);
+    }
 };
 
-static bool int64_ok(cl_device_id device)
+// set scalar value to vector of halfs
+template <typename Ty, int N = 0>
+typename std::enable_if<TypeManager<Ty>::is_sb_vector_type::value>::type
+set_value(Ty &lhs, const cl_ulong &rhs)
 {
-    char profile[128];
-    int error;
+    const int size = sizeof(Ty) / sizeof(typename TypeManager<Ty>::scalar_type);
+    for (auto i = 0; i < size; ++i)
+    {
+        lhs.data.s[i] = rhs;
+    }
+}
 
-    error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile),
-                            (void *)&profile, NULL);
-    if (error)
+
+// set scalar value to vector
+template <typename Ty>
+typename std::enable_if<TypeManager<Ty>::is_vector_type::value>::type
+set_value(Ty &lhs, const cl_ulong &rhs)
+{
+    const int size = sizeof(Ty) / sizeof(typename TypeManager<Ty>::scalar_type);
+    for (auto i = 0; i < size; ++i)
     {
-        log_info("clGetDeviceInfo failed with CL_DEVICE_PROFILE\n");
-        return false;
+        lhs.s[i] = rhs;
     }
+}
 
-    if (strcmp(profile, "EMBEDDED_PROFILE") == 0)
-        return is_extension_available(device, "cles_khr_int64");
+// set vector to vector value
+template <typename Ty>
+typename std::enable_if<TypeManager<Ty>::is_vector_type::value>::type
+set_value(Ty &lhs, const Ty &rhs)
+{
+    lhs = rhs;
+}
 
-    return true;
+// set scalar value to vector size 3
+template <typename Ty, int N = 0>
+typename std::enable_if<TypeManager<Ty>::is_sb_vector_size3::value>::type
+set_value(Ty &lhs, const cl_ulong &rhs)
+{
+    for (auto i = 0; i < 3; ++i)
+    {
+        lhs.data.s[i] = rhs;
+    }
 }
 
-template <> struct TypeCheck<cl_ulong>
+// set scalar value to scalar
+template <typename Ty>
+typename std::enable_if<std::is_scalar<Ty>::value>::type
+set_value(Ty &lhs, const cl_ulong &rhs)
 {
-    static bool val(cl_device_id device) { return int64_ok(device); }
-};
-template <> struct TypeCheck<cl_long>
+    lhs = static_cast<Ty>(rhs);
+}
+
+// set scalar value to half scalar
+template <typename Ty>
+typename std::enable_if<TypeManager<Ty>::is_sb_scalar_type::value>::type
+set_value(Ty &lhs, const cl_ulong &rhs)
 {
-    static bool val(cl_device_id device) { return int64_ok(device); }
-};
-template <> struct TypeCheck<cl_float>
+    lhs.data = rhs;
+}
+
+// compare for common vectors
+template <typename Ty>
+typename std::enable_if<TypeManager<Ty>::is_vector_type::value, bool>::type
+compare(const Ty &lhs, const Ty &rhs)
 {
-    static bool val(cl_device_id) { return true; }
-};
-template <> struct TypeCheck<cl_half>
+    const int size = sizeof(Ty) / sizeof(typename TypeManager<Ty>::scalar_type);
+    for (auto i = 0; i < size; ++i)
+    {
+        if (lhs.s[i] != rhs.s[i])
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
+// compare for vectors 3
+template <typename Ty>
+typename std::enable_if<TypeManager<Ty>::is_sb_vector_size3::value, bool>::type
+compare(const Ty &lhs, const Ty &rhs)
 {
-    static bool val(cl_device_id device)
+    for (auto i = 0; i < 3; ++i)
     {
-        return is_extension_available(device, "cl_khr_fp16");
+        if (lhs.data.s[i] != rhs.data.s[i])
+        {
+            return false;
+        }
     }
-};
-template <> struct TypeCheck<double>
+    return true;
+}
+
+// compare for half vectors
+template <typename Ty>
+typename std::enable_if<TypeManager<Ty>::is_sb_vector_type::value, bool>::type
+compare(const Ty &lhs, const Ty &rhs)
 {
-    static bool val(cl_device_id device)
+    const int size = sizeof(Ty) / sizeof(typename TypeManager<Ty>::scalar_type);
+    for (auto i = 0; i < size; ++i)
     {
-        int error;
-        cl_device_fp_config c;
-        error = clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(c),
-                                (void *)&c, NULL);
-        if (error)
+        if (lhs.data.s[i] != rhs.data.s[i])
         {
-            log_info(
-                "clGetDeviceInfo failed with CL_DEVICE_DOUBLE_FP_CONFIG\n");
             return false;
         }
-        return c != 0;
     }
-};
+    return true;
+}
+
+// compare for scalars
+template <typename Ty>
+typename std::enable_if<std::is_scalar<Ty>::value, bool>::type
+compare(const Ty &lhs, const Ty &rhs)
+{
+    return lhs == rhs;
+}
+
+// compare for scalar halfs
+template <typename Ty>
+typename std::enable_if<TypeManager<Ty>::is_sb_scalar_type::value, bool>::type
+compare(const Ty &lhs, const Ty &rhs)
+{
+    return lhs.data == rhs.data;
+}
+
+template <typename Ty> inline bool compare_ordered(const Ty &lhs, const Ty &rhs)
+{
+    return lhs == rhs;
+}
+
+template <>
+inline bool compare_ordered(const subgroups::cl_half &lhs,
+                            const subgroups::cl_half &rhs)
+{
+    return cl_half_to_float(lhs.data) == cl_half_to_float(rhs.data);
+}
 
+template <typename Ty>
+inline bool compare_ordered(const subgroups::cl_half &lhs, const int &rhs)
+{
+    return cl_half_to_float(lhs.data) == rhs;
+}
 
 // Run a test kernel to compute the result of a built-in on an input
 static int run_kernel(cl_context context, cl_command_queue queue,
@@ -318,6 +1211,9 @@ static int run_kernel(cl_context context, cl_command_queue queue,
                                  NULL);
     test_error(error, "clEnqueueWriteBuffer failed");
 
+    error = clEnqueueWriteBuffer(queue, xy, CL_FALSE, 0, msize, mdata, 0, NULL,
+                                 NULL);
+    test_error(error, "clEnqueueWriteBuffer failed");
     error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0,
                                    NULL, NULL);
     test_error(error, "clEnqueueNDRangeKernel failed");
@@ -337,42 +1233,93 @@ static int run_kernel(cl_context context, cl_command_queue queue,
 }
 
 // Driver for testing a single built in function
-template <typename Ty, typename Fns, size_t GSIZE, size_t LSIZE,
-          size_t TSIZE = 0>
-struct test
+template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
 {
+    static int mrun(cl_device_id device, cl_context context,
+                    cl_command_queue queue, int num_elements, const char *kname,
+                    const char *src, WorkGroupParams test_params)
+    {
+        int error = TEST_PASS;
+        for (auto &mask : test_params.all_work_item_masks)
+        {
+            test_params.work_items_mask = mask;
+            error |= run(device, context, queue, num_elements, kname, src,
+                         test_params);
+        }
+        return error;
+    };
     static int run(cl_device_id device, cl_context context,
                    cl_command_queue queue, int num_elements, const char *kname,
-                   const char *src, int dynscl, bool useCoreSubgroups)
+                   const char *src, WorkGroupParams test_params)
     {
         size_t tmp;
         int error;
         int subgroup_size, num_subgroups;
         size_t realSize;
-        size_t global;
-        size_t local;
+        size_t global = test_params.global_workgroup_size;
+        size_t local = test_params.local_workgroup_size;
         clProgramWrapper program;
         clKernelWrapper kernel;
         cl_platform_id platform;
-        cl_int sgmap[2 * GSIZE];
-        Ty mapin[LSIZE];
-        Ty mapout[LSIZE];
+        std::vector<cl_int> sgmap;
+        sgmap.resize(4 * global);
+        std::vector<Ty> mapin;
+        mapin.resize(local);
+        std::vector<Ty> mapout;
+        mapout.resize(local);
+        std::stringstream kernel_sstr;
+        if (test_params.work_items_mask != 0)
+        {
+            kernel_sstr << "#define WORK_ITEMS_MASK ";
+            kernel_sstr << "0x" << std::hex << test_params.work_items_mask
+                        << "\n";
+        }
+
 
+        kernel_sstr << "#define NR_OF_ACTIVE_WORK_ITEMS ";
+        kernel_sstr << NR_OF_ACTIVE_WORK_ITEMS << "\n";
         // Make sure a test of type Ty is supported by the device
-        if (!TypeCheck<Ty>::val(device)) return 0;
+        if (!TypeManager<Ty>::type_supported(device))
+        {
+            log_info("Data type not supported : %s\n", TypeManager<Ty>::name());
+            return 0;
+        }
+        else
+        {
+            if (strstr(TypeManager<Ty>::name(), "double"))
+            {
+                kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n";
+            }
+            else if (strstr(TypeManager<Ty>::name(), "half"))
+            {
+                kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp16: enable\n";
+            }
+        }
+
+        for (std::string extension : test_params.required_extensions)
+        {
+            if (!is_extension_available(device, extension.c_str()))
+            {
+                log_info("The extension %s not supported on this device. SKIP "
+                         "testing - kernel %s data type %s\n",
+                         extension.c_str(), kname, TypeManager<Ty>::name());
+                return TEST_PASS;
+            }
+            kernel_sstr << "#pragma OPENCL EXTENSION " + extension
+                    + ": enable\n";
+        }
 
         error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform),
                                 (void *)&platform, NULL);
         test_error(error, "clGetDeviceInfo failed for CL_DEVICE_PLATFORM");
-        std::stringstream kernel_sstr;
-        if (useCoreSubgroups)
+        if (test_params.use_core_subgroups)
         {
             kernel_sstr
                 << "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n";
         }
         kernel_sstr << "#define XY(M,I) M[I].x = get_sub_group_local_id(); "
                        "M[I].y = get_sub_group_id();\n";
-        kernel_sstr << TypeDef<Ty>::val();
+        kernel_sstr << TypeManager<Ty>::add_typedef();
         kernel_sstr << src;
         const std::string &kernel_str = kernel_sstr.str();
         const char *kernel_src = kernel_str.c_str();
@@ -382,16 +1329,18 @@ struct test
         if (error != 0) return error;
 
         // Determine some local dimensions to use for the test.
-        global = GSIZE;
-        error = get_max_common_work_group_size(context, kernel, GSIZE, &local);
+        error = get_max_common_work_group_size(
+            context, kernel, test_params.global_workgroup_size, &local);
         test_error(error, "get_max_common_work_group_size failed");
 
         // Limit it a bit so we have muliple work groups
-        // Ideally this will still be large enough to give us multiple subgroups
-        if (local > LSIZE) local = LSIZE;
+        // Ideally this will still be large enough to give us multiple subgroups
+        if (local > test_params.local_workgroup_size)
+            local = test_params.local_workgroup_size;
+
 
         // Get the sub group info
-        subgroupsAPI subgroupsApiSet(platform, useCoreSubgroups);
+        subgroupsAPI subgroupsApiSet(platform, test_params.use_core_subgroups);
         clGetKernelSubGroupInfoKHR_fn clGetKernelSubGroupInfo_ptr =
             subgroupsApiSet.clGetKernelSubGroupInfo_ptr();
         if (clGetKernelSubGroupInfo_ptr == NULL)
@@ -435,8 +1384,9 @@ struct test
 
         std::vector<Ty> idata;
         std::vector<Ty> odata;
-        size_t input_array_size = GSIZE;
-        size_t output_array_size = GSIZE;
+        size_t input_array_size = global;
+        size_t output_array_size = global;
+        int dynscl = test_params.dynsc;
 
         if (dynscl != 0)
         {
@@ -449,28 +1399,96 @@ struct test
         odata.resize(output_array_size);
 
         // Run the kernel once on zeroes to get the map
-        memset(&idata[0], 0, input_array_size * sizeof(Ty));
-        error = run_kernel(context, queue, kernel, global, local, &idata[0],
-                           input_array_size * sizeof(Ty), sgmap,
-                           global * sizeof(cl_int) * 2, &odata[0],
+        memset(idata.data(), 0, input_array_size * sizeof(Ty));
+        error = run_kernel(context, queue, kernel, global, local, idata.data(),
+                           input_array_size * sizeof(Ty), sgmap.data(),
+                           global * sizeof(cl_int4), odata.data(),
                            output_array_size * sizeof(Ty), TSIZE * sizeof(Ty));
-        if (error) return error;
+        test_error(error, "Running kernel first time failed");
 
         // Generate the desired input for the kernel
-        Fns::gen(&idata[0], mapin, sgmap, subgroup_size, (int)local,
-                 (int)global / (int)local);
 
-        error = run_kernel(context, queue, kernel, global, local, &idata[0],
-                           input_array_size * sizeof(Ty), sgmap,
-                           global * sizeof(cl_int) * 2, &odata[0],
+        test_params.subgroup_size = subgroup_size;
+        Fns::gen(idata.data(), mapin.data(), sgmap.data(), test_params);
+        error = run_kernel(context, queue, kernel, global, local, idata.data(),
+                           input_array_size * sizeof(Ty), sgmap.data(),
+                           global * sizeof(cl_int4), odata.data(),
                            output_array_size * sizeof(Ty), TSIZE * sizeof(Ty));
-        if (error) return error;
-
+        test_error(error, "Running kernel second time failed");
 
         // Check the result
-        return Fns::chk(&idata[0], &odata[0], mapin, mapout, sgmap,
-                        subgroup_size, (int)local, (int)global / (int)local);
+        error = Fns::chk(idata.data(), odata.data(), mapin.data(),
+                         mapout.data(), sgmap.data(), test_params);
+        test_error(error, "Data verification failed");
+        return TEST_PASS;
     }
 };
 
+static void set_last_workgroup_params(int non_uniform_size,
+                                      int &number_of_subgroups,
+                                      int subgroup_size, int &workgroup_size,
+                                      int &last_subgroup_size)
+{
+    number_of_subgroups = 1 + non_uniform_size / subgroup_size;
+    last_subgroup_size = non_uniform_size % subgroup_size;
+    workgroup_size = non_uniform_size;
+}
+
+template <typename Ty>
+static void set_randomdata_for_subgroup(Ty *workgroup, int wg_offset,
+                                        int current_sbs)
+{
+    int randomize_data = (int)(genrand_int32(gMTdata) % 3);
+    // Initialize data matrix indexed by local id and sub group id
+    switch (randomize_data)
+    {
+        case 0:
+            memset(&workgroup[wg_offset], 0, current_sbs * sizeof(Ty));
+            break;
+        case 1: {
+            memset(&workgroup[wg_offset], 0, current_sbs * sizeof(Ty));
+            int wi_id = (int)(genrand_int32(gMTdata) % (cl_uint)current_sbs);
+            set_value(workgroup[wg_offset + wi_id], 41);
+        }
+        break;
+        case 2:
+            memset(&workgroup[wg_offset], 0xff, current_sbs * sizeof(Ty));
+            break;
+    }
+}
+
+struct RunTestForType
+{
+    RunTestForType(cl_device_id device, cl_context context,
+                   cl_command_queue queue, int num_elements,
+                   WorkGroupParams test_params)
+        : device_(device), context_(context), queue_(queue),
+          num_elements_(num_elements), test_params_(test_params)
+    {}
+    template <typename T, typename U>
+    int run_impl(const char *kernel_name, const char *source)
+    {
+        int error = TEST_PASS;
+        if (test_params_.all_work_item_masks.size() > 0)
+        {
+            error = test<T, U>::mrun(device_, context_, queue_, num_elements_,
+                                     kernel_name, source, test_params_);
+        }
+        else
+        {
+            error = test<T, U>::run(device_, context_, queue_, num_elements_,
+                                    kernel_name, source, test_params_);
+        }
+
+        return error;
+    }
+
+private:
+    cl_device_id device_;
+    cl_context context_;
+    cl_command_queue queue_;
+    int num_elements_;
+    WorkGroupParams test_params_;
+};
+
 #endif
diff --git a/test_conformance/subgroups/test_barrier.cpp b/test_conformance/subgroups/test_barrier.cpp
index e6ce1d2ecd..47e42f65af 100644
--- a/test_conformance/subgroups/test_barrier.cpp
+++ b/test_conformance/subgroups/test_barrier.cpp
@@ -59,10 +59,15 @@ static const char *gbar_source =
 // barrier test functions
 template <int Which> struct BAR
 {
-    static void gen(cl_int *x, cl_int *t, cl_int *m, int ns, int nw, int ng)
+    static void gen(cl_int *x, cl_int *t, cl_int *m,
+                    const WorkGroupParams &test_params)
     {
         int i, ii, j, k, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
         int nj = (nw + ns - 1) / ns;
+        ng = ng / nw;
         int e;
 
         ii = 0;
@@ -79,8 +84,7 @@ template <int Which> struct BAR
             // Now map into work group using map from device
             for (j = 0; j < nw; ++j)
             {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                x[j] = t[i];
+                x[j] = t[j];
             }
 
             x += nw;
@@ -89,10 +93,14 @@ template <int Which> struct BAR
     }
 
     static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
-                   int ns, int nw, int ng)
+                   const WorkGroupParams &test_params)
     {
         int ii, i, j, k, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
         int nj = (nw + ns - 1) / ns;
+        ng = ng / nw;
         cl_int tr, rr;
 
         if (Which == 0)
@@ -105,9 +113,8 @@ template <int Which> struct BAR
             // Map to array indexed to array indexed by local ID and sub group
             for (j = 0; j < nw; ++j)
             {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                mx[i] = x[j];
-                my[i] = y[j];
+                mx[j] = x[j];
+                my[j] = y[j];
             }
 
             for (j = 0; j < nj; ++j)
@@ -123,8 +130,9 @@ template <int Which> struct BAR
                     if (tr != rr)
                     {
                         log_error("ERROR: sub_group_barrier mismatch for local "
-                                  "id %d in sub group %d in group %d\n",
-                                  i, j, k);
+                                  "id %d in sub group %d in group %d expected "
+                                  "%d got %d\n",
+                                  i, j, k, tr, rr);
                         return -1;
                     }
                 }
@@ -144,18 +152,18 @@ int test_barrier_functions(cl_device_id device, cl_context context,
                            cl_command_queue queue, int num_elements,
                            bool useCoreSubgroups)
 {
-    int error;
+    int error = TEST_PASS;
 
     // Adjust these individually below if desired/needed
-#define G 2000
-#define L 200
-
-    error = test<cl_int, BAR<0>, G, L>::run(device, context, queue,
-                                            num_elements, "test_lbar",
-                                            lbar_source, 0, useCoreSubgroups);
-    error = test<cl_int, BAR<1>, G, L, G>::run(
-        device, context, queue, num_elements, "test_gbar", gbar_source, 0,
-        useCoreSubgroups);
+    constexpr size_t global_work_size = 2000;
+    constexpr size_t local_work_size = 200;
+    WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.use_core_subgroups = useCoreSubgroups;
+    error = test<cl_int, BAR<0>>::run(device, context, queue, num_elements,
+                                      "test_lbar", lbar_source, test_params);
+    error |= test<cl_int, BAR<1>, global_work_size>::run(
+        device, context, queue, num_elements, "test_gbar", gbar_source,
+        test_params);
 
     return error;
 }
diff --git a/test_conformance/subgroups/test_ifp.cpp b/test_conformance/subgroups/test_ifp.cpp
index 02850e5f7d..428f2cdcde 100644
--- a/test_conformance/subgroups/test_ifp.cpp
+++ b/test_conformance/subgroups/test_ifp.cpp
@@ -46,7 +46,7 @@ static const char *ifp_source =
     "#define INST_COUNT 0x3\n"
     "\n"
     "__kernel void\n"
-    "test_ifp(const __global int *in, __global int2 *xy, __global int *out)\n"
+    "test_ifp(const __global int *in, __global int4 *xy, __global int *out)\n"
     "{\n"
     "    __local atomic_int loc[NUM_LOC];\n"
     "\n"
@@ -225,10 +225,15 @@ void run_insts(cl_int *x, cl_int *p, int n)
 
 struct IFP
 {
-    static void gen(cl_int *x, cl_int *t, cl_int *, int ns, int nw, int ng)
+    static void gen(cl_int *x, cl_int *t, cl_int *,
+                    const WorkGroupParams &test_params)
     {
         int k;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
         int nj = (nw + ns - 1) / ns;
+        ng = ng / nw;
 
         // We need at least 2 sub groups per group for this test
         if (nj == 1) return;
@@ -240,11 +245,15 @@ struct IFP
         }
     }
 
-    static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *, int ns,
-                   int nw, int ng)
+    static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *,
+                   const WorkGroupParams &test_params)
     {
         int i, k;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
         int nj = (nw + ns - 1) / ns;
+        ng = ng / nw;
 
         // We need at least 2 sub groups per group for this tes
         if (nj == 1) return 0;
@@ -275,14 +284,17 @@ struct IFP
 int test_ifp(cl_device_id device, cl_context context, cl_command_queue queue,
              int num_elements, bool useCoreSubgroups)
 {
-    int error;
+    int error = TEST_PASS;
 
+    // Global/local work group sizes
     // Adjust these individually below if desired/needed
-#define G 2000
-#define L 200
-    error = test<cl_int, IFP, G, L>::run(device, context, queue, num_elements,
-                                         "test_ifp", ifp_source, NUM_LOC + 1,
-                                         useCoreSubgroups);
+    constexpr size_t global_work_size = 2000;
+    constexpr size_t local_work_size = 200;
+    WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.use_core_subgroups = useCoreSubgroups;
+    test_params.dynsc = NUM_LOC + 1;
+    error = test<cl_int, IFP>::run(device, context, queue, num_elements,
+                                   "test_ifp", ifp_source, test_params);
     return error;
 }
 
diff --git a/test_conformance/subgroups/test_subgroup.cpp b/test_conformance/subgroups/test_subgroup.cpp
new file mode 100644
index 0000000000..c0e4952408
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup.cpp
@@ -0,0 +1,217 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "subgroup_common_kernels.h"
+#include "subgroup_common_templates.h"
+#include "harness/conversions.h"
+#include "harness/typeWrappers.h"
+
+namespace {
+// sub_group_any / sub_group_all: host-side input generation and checking
+template <NonUniformVoteOp operation> struct AA
+{
+    static void gen(cl_int *x, cl_int *t, cl_int *m,
+                    const WorkGroupParams &test_params)
+    {
+        int i, ii, j, k, n;
+        int ng = test_params.global_workgroup_size;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int nj = (nw + ns - 1) / ns; // subgroups per work group (rounded up)
+        int e;
+        ng = ng / nw; // number of work groups
+        ii = 0;
+        log_info("  sub_group_%s...\n", operation_names(operation));
+        for (k = 0; k < ng; ++k)
+        {
+            for (j = 0; j < nj; ++j)
+            {
+                ii = j * ns;
+                n = ii + ns > nw ? nw - ii : ns; // last subgroup may be short
+                e = (int)(genrand_int32(gMTdata) % 3);
+
+                // Subgroup inputs: all zero, a single nonzero, or all bits set
+                switch (e)
+                {
+                    case 0: memset(&t[ii], 0, n * sizeof(cl_int)); break;
+                    case 1:
+                        memset(&t[ii], 0, n * sizeof(cl_int));
+                        i = (int)(genrand_int32(gMTdata) % (cl_uint)n);
+                        t[ii + i] = 41;
+                        break;
+                    case 2: memset(&t[ii], 0xff, n * sizeof(cl_int)); break;
+                }
+            }
+
+            // Copy the generated inputs into this work group's slice of x
+            for (j = 0; j < nw; ++j)
+            {
+                x[j] = t[j];
+            }
+
+            x += nw;
+            m += 4 * nw;
+        }
+    }
+
+    static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int ii, i, j, k, n;
+        int ng = test_params.global_workgroup_size;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int nj = (nw + ns - 1) / ns; // subgroups per work group (rounded up)
+        cl_int taa, raa;
+        ng = ng / nw; // number of work groups
+
+        for (k = 0; k < ng; ++k)
+        {
+            // Copy this work group's inputs/outputs into the scratch arrays
+            for (j = 0; j < nw; ++j)
+            {
+                mx[j] = x[j];
+                my[j] = y[j];
+            }
+
+            for (j = 0; j < nj; ++j)
+            {
+                ii = j * ns;
+                n = ii + ns > nw ? nw - ii : ns;
+
+                // Compute the expected any/all vote over the subgroup's inputs
+                if (operation == NonUniformVoteOp::any)
+                {
+                    taa = 0;
+                    for (i = 0; i < n; ++i) taa |= mx[ii + i] != 0;
+                }
+
+                if (operation == NonUniformVoteOp::all)
+                {
+                    taa = 1;
+                    for (i = 0; i < n; ++i) taa &= mx[ii + i] != 0;
+                }
+
+                // Every work item in the subgroup must report that vote
+                for (i = 0; i < n; ++i)
+                {
+                    raa = my[ii + i] != 0;
+                    if (raa != taa)
+                    {
+                        log_error("ERROR: sub_group_%s mismatch for local id "
+                                  "%d in sub group %d in group %d\n",
+                                  operation_names(operation), i, j, k);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+        log_info("  sub_group_%s... passed\n", operation_names(operation));
+        return TEST_PASS;
+    }
+};
+// any/all kernels; Type and XY() are presumably supplied by the harness.
+static const char *any_source = "__kernel void test_any(const __global Type "
+                                "*in, __global int4 *xy, __global Type *out)\n"
+                                "{\n"
+                                "    int gid = get_global_id(0);\n"
+                                "    XY(xy,gid);\n"
+                                "    out[gid] = sub_group_any(in[gid]);\n"
+                                "}\n";
+
+static const char *all_source = "__kernel void test_all(const __global Type "
+                                "*in, __global int4 *xy, __global Type *out)\n"
+                                "{\n"
+                                "    int gid = get_global_id(0);\n"
+                                "    XY(xy,gid);\n"
+                                "    out[gid] = sub_group_all(in[gid]);\n"
+                                "}\n";
+
+// Runs broadcast, reduction and scan tests for type T; ORs their results.
+template <typename T>
+int run_broadcast_scan_reduction_for_type(RunTestForType rft)
+{
+    int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
+        "test_bcast", bcast_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
+                                                            redadd_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
+                                                            redmax_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
+                                                            redmin_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
+                                                             scinadd_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
+                                                             scinmax_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
+                                                             scinmin_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
+                                                             scexadd_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
+                                                             scexmax_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
+                                                             scexmin_source);
+    return error;
+}
+
+}
+// Shared driver for the core and cl_khr_subgroups variants of this suite.
+int test_subgroup_functions(cl_device_id device, cl_context context,
+                            cl_command_queue queue, int num_elements,
+                            bool useCoreSubgroups)
+{
+    constexpr size_t global_work_size = 2000, local_work_size = 200;
+    WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.use_core_subgroups = useCoreSubgroups; // honour the caller
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+    int error =
+        rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("test_any", any_source);
+    error |=
+        rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("test_all", all_source);
+    error |= run_broadcast_scan_reduction_for_type<cl_int>(rft);
+    error |= run_broadcast_scan_reduction_for_type<cl_uint>(rft);
+    error |= run_broadcast_scan_reduction_for_type<cl_long>(rft);
+    error |= run_broadcast_scan_reduction_for_type<cl_ulong>(rft);
+    error |= run_broadcast_scan_reduction_for_type<cl_float>(rft);
+    error |= run_broadcast_scan_reduction_for_type<cl_double>(rft);
+    error |= run_broadcast_scan_reduction_for_type<subgroups::cl_half>(rft);
+    return error;
+}
+// Core variant: runs the suite with useCoreSubgroups set to true.
+int test_subgroup_functions_core(cl_device_id device, cl_context context,
+                                 cl_command_queue queue, int num_elements)
+{
+    return test_subgroup_functions(device, context, queue, num_elements, true);
+}
+// Extension variant: skipped unless the device reports cl_khr_subgroups.
+int test_subgroup_functions_ext(cl_device_id device, cl_context context,
+                                cl_command_queue queue, int num_elements)
+{
+    bool hasExtension = is_extension_available(device, "cl_khr_subgroups");
+
+    if (!hasExtension)
+    {
+        log_info(
+            "Device does not support 'cl_khr_subgroups'. Skipping the test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
+    return test_subgroup_functions(device, context, queue, num_elements, false);
+}
diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp
new file mode 100644
index 0000000000..f2e4060b0b
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_ballot.cpp
@@ -0,0 +1,1089 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "subgroup_common_templates.h"
+#include "harness/typeWrappers.h"
+#include <bitset>
+
+namespace {
+// sub_group_ballot test: chk expects an output of 1 from each work item
+template <typename Ty> struct BALLOT
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        // no input data is generated; this only logs the test mode
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int non_uniform_size = gws % lws;
+        log_info("  sub_group_ballot...\n");
+        if (non_uniform_size)
+        {
+            log_info("  non uniform work group size mode ON\n");
+        }
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int wi_id, wg_id, sb_id;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        int current_sbs = 0;
+        cl_uint expected_result, device_result;
+        int non_uniform_size = gws % lws;
+        int wg_number = gws / lws;
+        wg_number = non_uniform_size ? wg_number + 1 : wg_number;
+        int last_subgroup_size = 0;
+
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            if (non_uniform_size && wg_id == wg_number - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws,
+                                          last_subgroup_size);
+            }
+
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            { // inside the work_group
+                // read device outputs for work_group
+                my[wi_id] = y[wi_id];
+            }
+
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                if (last_subgroup_size && sb_id == sb_number - 1)
+                {
+                    current_sbs = last_subgroup_size;
+                }
+                else
+                {
+                    current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                }
+                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                {
+                    device_result = my[wg_offset + wi_id];
+                    expected_result = 1;
+                    if (!compare(device_result, expected_result))
+                    {
+                        log_error(
+                            "ERROR: sub_group_ballot mismatch for local id "
+                            "%d in sub group %d in group %d obtained {%d}, "
+                            "expected {%d} \n",
+                            wi_id, sb_id, wg_id, device_result,
+                            expected_result);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+            y += lws;
+            m += 4 * lws;
+        }
+        log_info("  sub_group_ballot... passed\n");
+        return TEST_PASS;
+    }
+};
+
+// Test for bit extract ballot functions
+template <typename Ty, BallotOp operation> struct BALLOT_BIT_EXTRACT
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int wi_id, sb_id, wg_id, l;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        int wg_number = gws / lws; // NOTE(review): partial group not counted
+        int limit_sbs = sbs > 100 ? 100 : sbs;
+        int non_uniform_size = gws % lws;
+        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
+                 TypeManager<Ty>::name());
+
+        if (non_uniform_size)
+        {
+            log_info("  non uniform work group size mode ON\n");
+        }
+
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                int current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                // random bit indices, each < min(limit_sbs, current_sbs)
+                int index_for_odd = (int)(genrand_int32(gMTdata) & 0x7fffffff)
+                    % (limit_sbs > current_sbs ? current_sbs : limit_sbs);
+                int index_for_even = (int)(genrand_int32(gMTdata) & 0x7fffffff)
+                    % (limit_sbs > current_sbs ? current_sbs : limit_sbs);
+                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                {
+                    // index of the third element in the vector.
+                    int midx = 4 * wg_offset + 4 * wi_id + 2;
+                    // store the odd/even bit indices in elements three and four
+                    m[midx] = (cl_int)index_for_odd;
+                    m[++midx] = (cl_int)index_for_even;
+                }
+                set_randomdata_for_subgroup<Ty>(t, wg_offset, current_sbs);
+            }
+
+            // Copy the generated inputs into this work group's slice of x
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            {
+                x[wi_id] = t[wi_id];
+            }
+
+            x += lws;
+            m += 4 * lws;
+        }
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int wi_id, wg_id, l, sb_id;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        int wg_number = gws / lws; // NOTE(review): partial group not counted
+        cl_uint4 expected_result, device_result;
+        int last_subgroup_size = 0;
+        int current_sbs = 0;
+        int non_uniform_size = gws % lws;
+
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            if (non_uniform_size && wg_id == wg_number - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws,
+                                          last_subgroup_size);
+            }
+            // Copy this work group's inputs/outputs into the scratch arrays
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            { // inside the work_group
+                // read host inputs for work_group
+                mx[wi_id] = x[wi_id];
+                // read device outputs for work_group
+                my[wi_id] = y[wi_id];
+            }
+
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                if (last_subgroup_size && sb_id == sb_number - 1)
+                {
+                    current_sbs = last_subgroup_size;
+                }
+                else
+                {
+                    current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                }
+                // locate in m the bit indices gen() stored for this subgroup
+                // (third and fourth element of its first 4-int entry)
+                int midx = 4 * wg_offset + 2;
+                // bit indices used by odd and even work items respectively
+                int index_for_odd = (int)m[midx];
+                int index_for_even = (int)m[++midx];
+
+                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                { // for each work item in the subgroup
+                    int bit_value = 0;
+                    // bit position, within one 32-bit component, that
+                    // this work item's expected value is derived from
+                    int take_shift =
+                        (wi_id & 1) ? index_for_odd % 32 : index_for_even % 32;
+                    int bit_mask = 1 << take_shift;
+
+                    if (wi_id < 32)
+                        (mx[wg_offset + wi_id].s0 & bit_mask) > 0
+                            ? bit_value = 1
+                            : bit_value = 0;
+                    if (wi_id >= 32 && wi_id < 64)
+                        (mx[wg_offset + wi_id].s1 & bit_mask) > 0
+                            ? bit_value = 1
+                            : bit_value = 0;
+                    if (wi_id >= 64 && wi_id < 96)
+                        (mx[wg_offset + wi_id].s2 & bit_mask) > 0
+                            ? bit_value = 1
+                            : bit_value = 0;
+                    if (wi_id >= 96 && wi_id < 128)
+                        (mx[wg_offset + wi_id].s3 & bit_mask) > 0
+                            ? bit_value = 1
+                            : bit_value = 0;
+
+                    if (wi_id & 1)
+                    {
+                        bit_value ? expected_result = { 1, 0, 0, 1 }
+                                  : expected_result = { 0, 0, 0, 1 };
+                    }
+                    else
+                    {
+                        bit_value ? expected_result = { 1, 0, 0, 2 }
+                                  : expected_result = { 0, 0, 0, 2 };
+                    }
+
+                    device_result = my[wg_offset + wi_id];
+                    if (!compare(device_result, expected_result))
+                    {
+                        log_error(
+                            "ERROR: sub_group_%s mismatch for local id %d in "
+                            "sub group %d in group %d obtained {%d, %d, %d, "
+                            "%d}, expected {%d, %d, %d, %d}\n",
+                            operation_names(operation), wi_id, sb_id, wg_id,
+                            device_result.s0, device_result.s1,
+                            device_result.s2, device_result.s3,
+                            expected_result.s0, expected_result.s1,
+                            expected_result.s2, expected_result.s3);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+            x += lws;
+            y += lws;
+            m += 4 * lws;
+        }
+        log_info("  sub_group_%s(%s)... passed\n", operation_names(operation),
+                 TypeManager<Ty>::name());
+        return TEST_PASS;
+    }
+};
+// Test for inverse ballot function
+template <typename Ty, BallotOp operation> struct BALLOT_INVERSE
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int non_uniform_size = gws % lws;
+        log_info("  sub_group_inverse_ballot...\n");
+        if (non_uniform_size)
+        {
+            log_info("  non uniform work group size mode ON\n");
+        }
+        // no input data is generated here
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int wi_id, wg_id, sb_id;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        cl_uint4 expected_result, device_result;
+        int non_uniform_size = gws % lws;
+        int wg_number = gws / lws;
+        int last_subgroup_size = 0;
+        int current_sbs = 0;
+        if (non_uniform_size) wg_number++;
+
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            if (non_uniform_size && wg_id == wg_number - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws,
+                                          last_subgroup_size);
+            }
+            // Copy this work group's inputs/outputs into the scratch arrays
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            { // inside the work_group
+                mx[wi_id] = x[wi_id]; // read host inputs for work_group
+                my[wi_id] = y[wi_id]; // read device outputs for work_group
+            }
+
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                if (last_subgroup_size && sb_id == sb_number - 1)
+                {
+                    current_sbs = last_subgroup_size;
+                }
+                else
+                {
+                    current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                }
+                // NOTE(review): midx is computed but never read in this
+                // function; it looks like a leftover from another test
+                int midx = 4 * wg_offset + 2;
+                // expected value depends only on the work item's parity
+                // Check result
+                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                { // for each subgroup work item
+
+                    wi_id & 1 ? expected_result = { 1, 0, 0, 1 }
+                              : expected_result = { 1, 0, 0, 2 };
+
+                    device_result = my[wg_offset + wi_id];
+                    if (!compare(device_result, expected_result))
+                    {
+                        log_error(
+                            "ERROR: sub_group_%s mismatch for local id %d in "
+                            "sub group %d in group %d obtained {%d, %d, %d, "
+                            "%d}, expected {%d, %d, %d, %d}\n",
+                            operation_names(operation), wi_id, sb_id, wg_id,
+                            device_result.s0, device_result.s1,
+                            device_result.s2, device_result.s3,
+                            expected_result.s0, expected_result.s1,
+                            expected_result.s2, expected_result.s3);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+            x += lws;
+            y += lws;
+            m += 4 * lws;
+        }
+
+        log_info("  sub_group_inverse_ballot... passed\n");
+        return TEST_PASS;
+    }
+};
+
+
+// Test for bit count/inclusive and exclusive scan/ find lsb msb ballot function
+template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int wi_id, wg_id, sb_id;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        int non_uniform_size = gws % lws;
+        int wg_number = gws / lws;
+        int last_subgroup_size = 0;
+        int current_sbs = 0;
+
+        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
+                 TypeManager<Ty>::name());
+        if (non_uniform_size)
+        {
+            log_info("  non uniform work group size mode ON\n");
+            wg_number++;
+        }
+        int e;
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            if (non_uniform_size && wg_id == wg_number - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws,
+                                          last_subgroup_size);
+            }
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                if (last_subgroup_size && sb_id == sb_number - 1)
+                {
+                    current_sbs = last_subgroup_size;
+                }
+                else
+                {
+                    current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                }
+                if (operation == BallotOp::ballot_bit_count
+                    || operation == BallotOp::ballot_inclusive_scan
+                    || operation == BallotOp::ballot_exclusive_scan)
+                {
+                    set_randomdata_for_subgroup<Ty>(t, wg_offset, current_sbs);
+                }
+                else if (operation == BallotOp::ballot_find_lsb
+                         || operation == BallotOp::ballot_find_msb)
+                {
+                    // Regarding to the spec, find lsb and find msb result is
+                    // undefined behavior if input value is zero, so generate
+                    // only non-zero values.
+                    for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                    {
+                        char x = (genrand_int32(gMTdata)) & 0xff;
+                        // undefined behaviour in case of 0;
+                        x = x ? x : 1;
+                        memset(&t[wg_offset + wi_id], x, sizeof(Ty));
+                    }
+                }
+                else
+                {
+                    log_error("Unknown operation...");
+                }
+            }
+
+            // Now map into work group using map from device
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            {
+                x[wi_id] = t[wi_id];
+            }
+
+            x += lws;
+            m += 4 * lws;
+        }
+    }
+
+    static bs128 getImportantBits(cl_uint sub_group_local_id,
+                                  cl_uint sub_group_size)
+    { // select the ballot bits that chk() keeps for the tested `operation`
+        bs128 mask;
+        if (operation == BallotOp::ballot_bit_count
+            || operation == BallotOp::ballot_find_lsb
+            || operation == BallotOp::ballot_find_msb)
+        { // count / find lsb / find msb: bits [0, sub_group_size)
+            for (cl_uint i = 0; i < sub_group_size; ++i) mask.set(i);
+        }
+        else if (operation == BallotOp::ballot_inclusive_scan
+                 || operation == BallotOp::ballot_exclusive_scan)
+        { // scans: bits [0, sub_group_local_id]; exclusive drops the own bit
+            for (cl_uint i = 0; i <= sub_group_local_id; ++i) mask.set(i);
+            if (operation == BallotOp::ballot_exclusive_scan)
+                mask.reset(sub_group_local_id);
+        }
+        return mask; // left as default-constructed for any other operation
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    { // returns TEST_FAIL on the first mismatch, TEST_PASS otherwise
+        int wi_id, wg_id, sb_id;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        int non_uniform_size = gws % lws;
+        int wg_number = gws / lws;
+        wg_number = non_uniform_size ? wg_number + 1 : wg_number;
+        cl_uint4 expected_result, device_result;
+        int last_subgroup_size = 0;
+        int current_sbs = 0;
+        // x: host inputs, y: device outputs, mx/my: per-work-group copies
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            if (non_uniform_size && wg_id == wg_number - 1)
+            { // last work group when gws is not a multiple of lws
+                set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws,
+                                          last_subgroup_size);
+            }
+            // Copy this work group's host inputs and device outputs to mx/my
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            { // inside the work_group
+                // read host inputs for work_group
+                mx[wi_id] = x[wi_id];
+                // read device outputs for work_group
+                my[wi_id] = y[wi_id];
+            }
+
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                if (last_subgroup_size && sb_id == sb_number - 1)
+                {
+                    current_sbs = last_subgroup_size;
+                }
+                else
+                {
+                    current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                }
+                // Check result (expected_result is reset once per subgroup)
+                expected_result = { 0, 0, 0, 0 };
+                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                { // for subgroup element
+                    bs128 bs;
+                    // convert cl_uint4 input into std::bitset<128>
+                    bs |= bs128(mx[wg_offset + wi_id].s0)
+                        | (bs128(mx[wg_offset + wi_id].s1) << 32)
+                        | (bs128(mx[wg_offset + wi_id].s2) << 64)
+                        | (bs128(mx[wg_offset + wi_id].s3) << 96);
+                    bs &= getImportantBits(wi_id, current_sbs);
+                    device_result = my[wg_offset + wi_id];
+                    if (operation == BallotOp::ballot_inclusive_scan
+                        || operation == BallotOp::ballot_exclusive_scan
+                        || operation == BallotOp::ballot_bit_count)
+                    { // expected value is the number of retained set bits
+                        expected_result.s0 = bs.count();
+                        if (!compare(device_result, expected_result))
+                        {
+                            log_error("ERROR: sub_group_%s "
+                                      "mismatch for local id %d in sub group "
+                                      "%d in group %d obtained {%u, %u, %u, "
+                                      "%u}, expected {%u, %u, %u, %u}\n",
+                                      operation_names(operation), wi_id, sb_id,
+                                      wg_id, device_result.s0, device_result.s1,
+                                      device_result.s2, device_result.s3,
+                                      expected_result.s0, expected_result.s1,
+                                      expected_result.s2, expected_result.s3);
+                            return TEST_FAIL;
+                        }
+                    }
+                    else if (operation == BallotOp::ballot_find_lsb)
+                    { // expected value is the lowest retained set bit
+                        for (int id = 0; id < current_sbs; ++id)
+                        {
+                            if (bs.test(id))
+                            {
+                                expected_result.s0 = id;
+                                break;
+                            }
+                        }
+                        if (!compare(device_result, expected_result))
+                        {
+                            log_error("ERROR: sub_group_ballot_find_lsb "
+                                      "mismatch for local id %d in sub group "
+                                      "%d in group %d obtained {%u, %u, %u, "
+                                      "%u}, expected {%u, %u, %u, %u}\n",
+                                      wi_id, sb_id, wg_id, device_result.s0,
+                                      device_result.s1, device_result.s2,
+                                      device_result.s3, expected_result.s0,
+                                      expected_result.s1, expected_result.s2,
+                                      expected_result.s3);
+                            return TEST_FAIL;
+                        }
+                    }
+                    else if (operation == BallotOp::ballot_find_msb)
+                    { // expected value is the highest retained set bit
+                        for (int id = current_sbs - 1; id >= 0; --id)
+                        {
+                            if (bs.test(id))
+                            {
+                                expected_result.s0 = id;
+                                break;
+                            }
+                        }
+                        if (!compare(device_result, expected_result))
+                        {
+                            log_error("ERROR: sub_group_ballot_find_msb "
+                                      "mismatch for local id %d in sub group "
+                                      "%d in group %d obtained {%u, %u, %u, "
+                                      "%u}, expected {%u, %u, %u, %u}\n",
+                                      wi_id, sb_id, wg_id, device_result.s0,
+                                      device_result.s1, device_result.s2,
+                                      device_result.s3, expected_result.s0,
+                                      expected_result.s1, expected_result.s2,
+                                      expected_result.s3);
+                            return TEST_FAIL;
+                        }
+                    }
+                }
+            }
+            x += lws;
+            y += lws;
+            m += 4 * lws;
+        }
+        log_info("  sub_group_ballot_%s(%s)... passed\n",
+                 operation_names(operation), TypeManager<Ty>::name());
+        return TEST_PASS;
+    }
+};
+
+// test mask functions
+template <typename Ty, BallotOp operation> struct SMASK
+{ // host reference for get_sub_group_<op>_mask; used here with cl_uint4
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    { // fill x with the mask each work item is expected to return
+        int wi_id, wg_id, sb_id;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        int wg_number = gws / lws;
+        log_info("  get_sub_group_%s_mask...\n", operation_names(operation));
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                int current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                // Produce expected masks for each work item in the subgroup
+                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                {
+                    int midx = 4 * wg_offset + 4 * wi_id;
+                    // m holds 4 ints per work item; [2] is the max subgroup size
+                    cl_uint max_sub_group_size = m[midx + 2];
+                    cl_uint4 expected_mask = generate_bit_mask(
+                        wi_id, operation_names(operation), max_sub_group_size);
+                    set_value(t[wg_offset + wi_id], expected_mask);
+                }
+            }
+
+            // Copy the expected masks of this work group from t into x
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            {
+                x[wi_id] = t[wi_id];
+            }
+            x += lws;
+            m += 4 * lws;
+        }
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    { // returns TEST_FAIL on the first mismatch, TEST_PASS otherwise
+        int wi_id, wg_id, sb_id;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        Ty expected_result, device_result;
+        int wg_number = gws / lws;
+
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            { // inside the work_group
+                mx[wi_id] = x[wi_id]; // read host inputs for work_group
+                my[wi_id] = y[wi_id]; // read device outputs for work_group
+            }
+
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            {
+                int wg_offset = sb_id * sbs;
+                int current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+
+                // Check result
+                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                { // inside the subgroup
+                    expected_result = mx[wg_offset + wi_id]; // host mask
+                    device_result = my[wg_offset + wi_id]; // device mask
+                    if (!compare(device_result, expected_result))
+                    { // print the four components; a vector cannot match %d
+                        const Ty &d = device_result, &e = expected_result;
+                        log_error("ERROR:  get_sub_group_%s_mask... mismatch "
+                                  "for local id %d in sub group %d in group "
+                                  "%d, obtained {%u, %u, %u, %u}, expected "
+                                  "{%u, %u, %u, %u}\n",
+                                  operation_names(operation), wi_id, sb_id,
+                                  wg_id, d.s0, d.s1, d.s2, d.s3, e.s0, e.s1,
+                                  e.s2, e.s3);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+            x += lws;
+            y += lws;
+            m += 4 * lws;
+        }
+        log_info("  get_sub_group_%s_mask... passed\n",
+                 operation_names(operation));
+        return TEST_PASS;
+    }
+};
+
+static const char *bcast_non_uniform_source =
+    "__kernel void test_bcast_non_uniform(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n"
+    "        out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);\n"
+    "    } else {\n"
+    "       out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);\n"
+    "    }\n"
+    "}\n";
+
+static const char *bcast_first_source =
+    "__kernel void test_bcast_first(const __global Type *in, __global int4 "
+    "*xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n"
+    "       out[gid] = sub_group_broadcast_first(x);\n"
+    "    } else {\n"
+    "       out[gid] = sub_group_broadcast_first(x);\n"
+    "    }\n"
+    "}\n";
+
+static const char *ballot_bit_count_source =
+    "__kernel void test_sub_group_ballot_bit_count(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint4 value = (uint4)(0,0,0,0);\n"
+    "    value = (uint4)(sub_group_ballot_bit_count(x),0,0,0);\n"
+    "    out[gid] = value;\n"
+    "}\n";
+
+static const char *ballot_inclusive_scan_source =
+    "__kernel void test_sub_group_ballot_inclusive_scan(const __global Type "
+    "*in, __global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint4 value = (uint4)(0,0,0,0);\n"
+    "    value = (uint4)(sub_group_ballot_inclusive_scan(x),0,0,0);\n"
+    "    out[gid] = value;\n"
+    "}\n";
+
+static const char *ballot_exclusive_scan_source =
+    "__kernel void test_sub_group_ballot_exclusive_scan(const __global Type "
+    "*in, __global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint4 value = (uint4)(0,0,0,0);\n"
+    "    value = (uint4)(sub_group_ballot_exclusive_scan(x),0,0,0);\n"
+    "    out[gid] = value;\n"
+    "}\n";
+
+static const char *ballot_find_lsb_source =
+    "__kernel void test_sub_group_ballot_find_lsb(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint4 value = (uint4)(0,0,0,0);\n"
+    "    value = (uint4)(sub_group_ballot_find_lsb(x),0,0,0);\n"
+    "    out[gid] = value;\n"
+    "}\n";
+
+static const char *ballot_find_msb_source =
+    "__kernel void test_sub_group_ballot_find_msb(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint4 value = (uint4)(0,0,0,0);"
+    "    value = (uint4)(sub_group_ballot_find_msb(x),0,0,0);"
+    "    out[gid] = value ;"
+    "}\n";
+
+static const char *get_subgroup_ge_mask_source =
+    "__kernel void test_get_sub_group_ge_mask(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].z = get_max_sub_group_size();\n"
+    "    Type x = in[gid];\n"
+    "    uint4 mask = get_sub_group_ge_mask();"
+    "    out[gid] = mask;\n"
+    "}\n";
+
+static const char *get_subgroup_gt_mask_source =
+    "__kernel void test_get_sub_group_gt_mask(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].z = get_max_sub_group_size();\n"
+    "    Type x = in[gid];\n"
+    "    uint4 mask = get_sub_group_gt_mask();"
+    "    out[gid] = mask;\n"
+    "}\n";
+
+static const char *get_subgroup_le_mask_source =
+    "__kernel void test_get_sub_group_le_mask(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].z = get_max_sub_group_size();\n"
+    "    Type x = in[gid];\n"
+    "    uint4 mask = get_sub_group_le_mask();"
+    "    out[gid] = mask;\n"
+    "}\n";
+
+static const char *get_subgroup_lt_mask_source =
+    "__kernel void test_get_sub_group_lt_mask(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].z = get_max_sub_group_size();\n"
+    "    Type x = in[gid];\n"
+    "    uint4 mask = get_sub_group_lt_mask();"
+    "    out[gid] = mask;\n"
+    "}\n";
+
+static const char *get_subgroup_eq_mask_source =
+    "__kernel void test_get_sub_group_eq_mask(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].z = get_max_sub_group_size();\n"
+    "    Type x = in[gid];\n"
+    "    uint4 mask = get_sub_group_eq_mask();"
+    "    out[gid] = mask;\n"
+    "}\n";
+
+static const char *ballot_source =
+    "__kernel void test_sub_group_ballot(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "uint4 full_ballot = sub_group_ballot(1);\n"
+    "uint divergence_mask;\n"
+    "uint4 partial_ballot;\n"
+    "uint gid = get_global_id(0);"
+    "XY(xy,gid);\n"
+    "if (get_sub_group_local_id() & 1) {\n"
+    "    divergence_mask = 0xaaaaaaaa;\n"
+    "    partial_ballot = sub_group_ballot(1);\n"
+    "} else {\n"
+    "    divergence_mask = 0x55555555;\n"
+    "    partial_ballot = sub_group_ballot(1);\n"
+    "}\n"
+    " size_t lws = get_local_size(0);\n"
+    "uint4 masked_ballot = full_ballot;\n"
+    "masked_ballot.x &= divergence_mask;\n"
+    "masked_ballot.y &= divergence_mask;\n"
+    "masked_ballot.z &= divergence_mask;\n"
+    "masked_ballot.w &= divergence_mask;\n"
+    "out[gid] = all(masked_ballot == partial_ballot);\n"
+
+    "} \n";
+
+static const char *ballot_source_inverse =
+    "__kernel void test_sub_group_ballot_inverse(const __global "
+    "Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint4 value = (uint4)(10,0,0,0);\n"
+    "    if (get_sub_group_local_id() & 1) {"
+    "        uint4 partial_ballot_mask = "
+    "(uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA);"
+    "        if (sub_group_inverse_ballot(partial_ballot_mask)) {\n"
+    "            value = (uint4)(1,0,0,1);\n"
+    "        } else {\n"
+    "            value = (uint4)(0,0,0,1);\n"
+    "        }\n"
+    "    } else {\n"
+    "       uint4 partial_ballot_mask = "
+    "(uint4)(0x55555555,0x55555555,0x55555555,0x55555555);"
+    "        if (sub_group_inverse_ballot(partial_ballot_mask)) {\n"
+    "            value = (uint4)(1,0,0,2);\n"
+    "        } else {\n"
+    "            value = (uint4)(0,0,0,2);\n"
+    "        }\n"
+    "    }\n"
+    "    out[gid] = value;\n"
+    "}\n";
+
+static const char *ballot_bit_extract_source =
+    "__kernel void test_sub_group_ballot_bit_extract(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint index = xy[gid].z;\n"
+    "    uint4 value = (uint4)(10,0,0,0);\n"
+    "    if (get_sub_group_local_id() & 1) {"
+    "       if (sub_group_ballot_bit_extract(x, xy[gid].z)) {\n"
+    "           value = (uint4)(1,0,0,1);\n"
+    "       } else {\n"
+    "           value = (uint4)(0,0,0,1);\n"
+    "       }\n"
+    "    } else {\n"
+    "       if (sub_group_ballot_bit_extract(x, xy[gid].w)) {\n"
+    "           value = (uint4)(1,0,0,2);\n"
+    "       } else {\n"
+    "           value = (uint4)(0,0,0,2);\n"
+    "       }\n"
+    "    }\n"
+    "    out[gid] = value;\n"
+    "}\n";
+
+template <typename T> int run_non_uniform_broadcast_for_type(RunTestForType rft)
+{
+    // Runs the non-uniform broadcast kernel for T; returns run_impl's status.
+    using Test = BC<T, SubgroupsBroadcastOp::non_uniform_broadcast>;
+    return rft.run_impl<T, Test>("test_bcast_non_uniform",
+                                 bcast_non_uniform_source);
+}
+
+
+}
+
+int test_subgroup_functions_ballot(cl_device_id device, cl_context context,
+                                   cl_command_queue queue, int num_elements)
+{ // entry point for the cl_khr_subgroup_ballot built-in function tests
+    std::vector<std::string> required_extensions = { "cl_khr_subgroup_ballot" };
+    constexpr size_t global_work_size = 170; // not a multiple of 64
+    constexpr size_t local_work_size = 64;
+    WorkGroupParams test_params(global_work_size, local_work_size,
+                                required_extensions);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+
+    // non uniform broadcast: scalar and 2/3/4/8/16-wide vectors of each type
+    int error = run_non_uniform_broadcast_for_type<cl_int>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_int2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_int3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_int4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_int8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_int16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_uint>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uint2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_uint3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uint4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uint8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uint16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_char>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_char2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_char3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_char4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_char8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_char16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_uchar>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uchar2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_uchar3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uchar4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uchar8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uchar16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_short>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_short2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_short3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_short4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_short8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_short16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_ushort>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ushort2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_ushort3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ushort4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ushort8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ushort16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_long>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_long2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_long3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_long4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_long8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_long16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_ulong>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ulong2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_ulong3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ulong4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ulong8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ulong16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_float>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_float2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_float3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_float4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_float8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_float16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_double>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_double2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_double3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_double4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_double8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_double16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_half>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_half2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_half3>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_half4>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_half8>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_half16>(rft);
+
+    // broadcast first functions (scalar types only)
+    error |=
+        rft.run_impl<cl_int, BC<cl_int, SubgroupsBroadcastOp::broadcast_first>>(
+            "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_uint,
+                          BC<cl_uint, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_long,
+                          BC<cl_long, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_ulong,
+                          BC<cl_ulong, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_short,
+                          BC<cl_short, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_ushort,
+                          BC<cl_ushort, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_char,
+                          BC<cl_char, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_uchar,
+                          BC<cl_uchar, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_float,
+                          BC<cl_float, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_double,
+                          BC<cl_double, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<
+        subgroups::cl_half,
+        BC<subgroups::cl_half, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+
+    // mask functions: get_sub_group_{eq,ge,gt,le,lt}_mask on cl_uint4
+    error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::eq_mask>>(
+        "test_get_sub_group_eq_mask", get_subgroup_eq_mask_source);
+    error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::ge_mask>>(
+        "test_get_sub_group_ge_mask", get_subgroup_ge_mask_source);
+    error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::gt_mask>>(
+        "test_get_sub_group_gt_mask", get_subgroup_gt_mask_source);
+    error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::le_mask>>(
+        "test_get_sub_group_le_mask", get_subgroup_le_mask_source);
+    error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::lt_mask>>(
+        "test_get_sub_group_lt_mask", get_subgroup_lt_mask_source);
+
+    // ballot functions: ballot, inverse, bit extract, count, scans, find
+    error |= rft.run_impl<cl_uint, BALLOT<cl_uint>>("test_sub_group_ballot",
+                                                    ballot_source);
+    error |= rft.run_impl<cl_uint4,
+                          BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>(
+        "test_sub_group_ballot_inverse", ballot_source_inverse);
+    error |= rft.run_impl<
+        cl_uint4, BALLOT_BIT_EXTRACT<cl_uint4, BallotOp::ballot_bit_extract>>(
+        "test_sub_group_ballot_bit_extract", ballot_bit_extract_source);
+    error |= rft.run_impl<
+        cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_bit_count>>(
+        "test_sub_group_ballot_bit_count", ballot_bit_count_source);
+    error |= rft.run_impl<
+        cl_uint4,
+        BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_inclusive_scan>>(
+        "test_sub_group_ballot_inclusive_scan", ballot_inclusive_scan_source);
+    error |= rft.run_impl<
+        cl_uint4,
+        BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_exclusive_scan>>(
+        "test_sub_group_ballot_exclusive_scan", ballot_exclusive_scan_source);
+    error |= rft.run_impl<
+        cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_lsb>>(
+        "test_sub_group_ballot_find_lsb", ballot_find_lsb_source);
+    error |= rft.run_impl<
+        cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_msb>>(
+        "test_sub_group_ballot_find_msb", ballot_find_msb_source);
+    return error; // results of all runs OR-ed together
+}
diff --git a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
new file mode 100644
index 0000000000..588e9cee18
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
@@ -0,0 +1,340 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "subgroup_common_templates.h"
+#include "harness/typeWrappers.h"
+
+#define CLUSTER_SIZE 4 // cluster width used by the host-side check (chk)
+#define CLUSTER_SIZE_STR "4" // same value as text, spliced into kernel sources
+
+namespace {
+static const char *redadd_clustered_source = // OpenCL C: clustered reduce add
+    "__kernel void test_redadd_clustered(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n" // stays 0 unless the size check below succeeds
+    "    if (sizeof(in[gid]) == " // builtin result vs. input element size
+    "sizeof(sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n" // report the element size in .w
+    "    out[gid] = sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR
+    ");\n"
+    "}\n";
+
+static const char *redmax_clustered_source = // same layout, reduce max
+    "__kernel void test_redmax_clustered(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR
+    ");\n"
+    "}\n";
+
+static const char *redmin_clustered_source = // same layout, reduce min
+    "__kernel void test_redmin_clustered(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR
+    ");\n"
+    "}\n";
+
+static const char *redmul_clustered_source = // same layout, reduce mul
+    "__kernel void test_redmul_clustered(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR
+    ");\n"
+    "}\n";
+
+static const char *redand_clustered_source = // same layout, bitwise and
+    "__kernel void test_redand_clustered(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR
+    ");\n"
+    "}\n";
+
+static const char *redor_clustered_source = // same layout, bitwise or
+    "__kernel void test_redor_clustered(const __global Type *in, __global int4 "
+    "*xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR
+    ");\n"
+    "}\n";
+
+static const char *redxor_clustered_source = // same layout, bitwise xor
+    "__kernel void test_redxor_clustered(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR
+    ");\n"
+    "}\n";
+
+static const char *redand_clustered_logical_source = // logical and variant
+    "__kernel void test_redand_clustered_logical(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR
+    ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = "
+    "sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR ");\n"
+    "}\n";
+
+static const char *redor_clustered_logical_source = // logical or variant
+    "__kernel void test_redor_clustered_logical(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR
+    ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = "
+    "sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR ");\n"
+    "}\n";
+
+static const char *redxor_clustered_logical_source = // logical xor variant
+    "__kernel void test_redxor_clustered_logical(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if ( sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR
+    ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = "
+    "sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR ");\n"
+    "}\n";
+
+
+// RED_CLU: input generation (gen) and host-side verification (chk) for the
+// sub_group_clustered_reduce_<operation> kernels, using CLUSTER_SIZE clusters.
+template <typename Ty, ArithmeticOp operation> struct RED_CLU
+{
+    // Logs the case being run and delegates data setup to genrand.
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        ng = ng / nw;
+        log_info("  sub_group_clustered_reduce_%s(%s, %zu bytes) ...\n",
+                 operation_names(operation), TypeManager<Ty>::name(),
+                 sizeof(Ty));
+        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+    }
+
+    // Checks y against a host-side cluster reduction of x, group by group.
+    // mx/my receive a copy of the current group; m is read as four cl_int per
+    // work item, element 3 being compared with sizeof(Ty). Returns TEST_*.
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        int nj = (nw + ns - 1) / ns;
+        ng = ng / nw;
+        // Round up so a trailing partial cluster still owns a result slot.
+        const int clusters_counter = (ns + CLUSTER_SIZE - 1) / CLUSTER_SIZE;
+        for (int k = 0; k < ng; ++k)
+        {
+            std::vector<cl_int> data_type_sizes;
+            // Copy the current group's inputs/outputs, indexed by local ID
+            for (int j = 0; j < nw; ++j)
+            {
+                mx[j] = x[j];
+                my[j] = y[j];
+                data_type_sizes.push_back(m[4 * j + 3]);
+            }
+            for (cl_int dts : data_type_sizes)
+            {
+                if (dts != sizeof(Ty))
+                {
+                    log_error("ERROR: sub_group_clustered_reduce_%s(%s) "
+                              "wrong data type size detected, expected: %zu, "
+                              "used by device %d, in group %d\n",
+                              operation_names(operation),
+                              TypeManager<Ty>::name(), sizeof(Ty), dts, k);
+                    return TEST_FAIL;
+                }
+            }
+
+            for (int j = 0; j < nj; ++j)
+            {
+                int ii = j * ns;
+                int n = ii + ns > nw ? nw - ii : ns;
+                std::vector<Ty> clusters_results(clusters_counter);
+                // Compute target: the running value restarts at every cluster
+                Ty tr = mx[ii];
+                for (int i = 0; i < n; ++i)
+                {
+                    if (i % CLUSTER_SIZE == 0)
+                        tr = mx[ii + i];
+                    else
+                        tr = calculate<Ty>(tr, mx[ii + i], operation);
+                    clusters_results[i / CLUSTER_SIZE] = tr;
+                }
+
+                // Check result: each item must hold its cluster's final value
+                for (int i = 0; i < n; ++i)
+                {
+                    Ty rr = my[ii + i];
+                    tr = clusters_results[i / CLUSTER_SIZE];
+                    if (!compare(rr, tr))
+                    {
+                        log_error(
+                            "ERROR: sub_group_clustered_reduce_%s(%s) mismatch "
+                            "for local id %d in sub group %d in group %d\n",
+                            operation_names(operation), TypeManager<Ty>::name(),
+                            i, j, k);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+        log_info("  sub_group_clustered_reduce_%s(%s, %zu bytes) ... passed\n",
+                 operation_names(operation), TypeManager<Ty>::name(),
+                 sizeof(Ty));
+        return TEST_PASS;
+    }
+};
+
+// Runs the clustered add, max, min and mul reductions for T; ORs the results.
+template <typename T>
+int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft)
+{
+    int status = rft.run_impl<T, RED_CLU<T, ArithmeticOp::add_>>(
+        "test_redadd_clustered", redadd_clustered_source);
+    status |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::max_>>(
+        "test_redmax_clustered", redmax_clustered_source);
+    status |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::min_>>(
+        "test_redmin_clustered", redmin_clustered_source);
+    return status | rft.run_impl<T, RED_CLU<T, ArithmeticOp::mul_>>(
+        "test_redmul_clustered", redmul_clustered_source);
+}
+// Runs the clustered bitwise and, or and xor reductions for T; ORs the results.
+template <typename T> int run_cluster_and_or_xor_for_type(RunTestForType rft)
+{
+    int status = rft.run_impl<T, RED_CLU<T, ArithmeticOp::and_>>(
+        "test_redand_clustered", redand_clustered_source);
+    status |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::or_>>(
+        "test_redor_clustered", redor_clustered_source);
+    return status | rft.run_impl<T, RED_CLU<T, ArithmeticOp::xor_>>(
+        "test_redxor_clustered", redxor_clustered_source);
+}
+// Runs the clustered logical and, or and xor reductions for T and returns
+// the bitwise OR of the three run_impl results.
+template <typename T>
+int run_cluster_logical_and_or_xor_for_type(RunTestForType rft)
+{
+    int status = rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_and>>(
+        "test_redand_clustered_logical", redand_clustered_logical_source);
+    status |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_or>>(
+        "test_redor_clustered_logical", redor_clustered_logical_source);
+    return status | rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_xor>>(
+        "test_redxor_clustered_logical", redxor_clustered_logical_source);
+}
+}
+
+// Entry point for the cl_khr_subgroup_clustered_reduce tests; ORs all results.
+int test_subgroup_functions_clustered_reduce(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements)
+{
+    std::vector<std::string> required_extensions;
+    required_extensions.emplace_back("cl_khr_subgroup_clustered_reduce");
+    constexpr size_t global_size = 2000;
+    constexpr size_t local_size = 200;
+    WorkGroupParams test_params(global_size, local_size, required_extensions);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+
+    // add/max/min/mul, then bitwise and/or/xor, then the logical variants
+    int status = run_cluster_red_add_max_min_mul_for_type<cl_int>(rft);
+    status |= run_cluster_red_add_max_min_mul_for_type<cl_uint>(rft);
+    status |= run_cluster_red_add_max_min_mul_for_type<cl_long>(rft);
+    status |= run_cluster_red_add_max_min_mul_for_type<cl_ulong>(rft);
+    status |= run_cluster_red_add_max_min_mul_for_type<cl_short>(rft);
+    status |= run_cluster_red_add_max_min_mul_for_type<cl_ushort>(rft);
+    status |= run_cluster_red_add_max_min_mul_for_type<cl_char>(rft);
+    status |= run_cluster_red_add_max_min_mul_for_type<cl_uchar>(rft);
+    status |= run_cluster_red_add_max_min_mul_for_type<cl_float>(rft);
+    status |= run_cluster_red_add_max_min_mul_for_type<cl_double>(rft);
+    status |= run_cluster_red_add_max_min_mul_for_type<subgroups::cl_half>(rft);
+
+    status |= run_cluster_and_or_xor_for_type<cl_int>(rft);
+    status |= run_cluster_and_or_xor_for_type<cl_uint>(rft);
+    status |= run_cluster_and_or_xor_for_type<cl_long>(rft);
+    status |= run_cluster_and_or_xor_for_type<cl_ulong>(rft);
+    status |= run_cluster_and_or_xor_for_type<cl_short>(rft);
+    status |= run_cluster_and_or_xor_for_type<cl_ushort>(rft);
+    status |= run_cluster_and_or_xor_for_type<cl_char>(rft);
+    status |= run_cluster_and_or_xor_for_type<cl_uchar>(rft);
+
+    status |= run_cluster_logical_and_or_xor_for_type<cl_int>(rft);
+    return status;
+}
diff --git a/test_conformance/subgroups/test_subgroup_extended_types.cpp b/test_conformance/subgroups/test_subgroup_extended_types.cpp
new file mode 100644
index 0000000000..98401b8ef0
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_extended_types.cpp
@@ -0,0 +1,138 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "subgroup_common_kernels.h"
+#include "subgroup_common_templates.h"
+#include "harness/typeWrappers.h"
+
+namespace {
+
+// Runs the "test_bcast" kernel (bcast_source) for type T; returns its result.
+template <typename T> int run_broadcast_for_extended_type(RunTestForType rft)
+{
+    return rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
+        "test_bcast", bcast_source);
+}
+
+// Runs reduce, inclusive-scan and exclusive-scan add/max/min kernels for T.
+template <typename T> int run_scan_reduction_for_type(RunTestForType rft)
+{
+    int status = rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>(
+        "test_redadd", redadd_source);
+    status |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>(
+        "test_redmax", redmax_source);
+    status |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>(
+        "test_redmin", redmin_source);
+    status |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
+        "test_scinadd", scinadd_source);
+    status |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
+        "test_scinmax", scinmax_source);
+    status |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
+        "test_scinmin", scinmin_source);
+    status |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
+        "test_scexadd", scexadd_source);
+    status |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
+        "test_scexmax", scexmax_source);
+    return status | rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
+        "test_scexmin", scexmin_source);
+}
+
+
+}
+
+// Entry point for the cl_khr_subgroup_extended_types tests; ORs all results.
+int test_subgroup_functions_extended_types(cl_device_id device,
+                                           cl_context context,
+                                           cl_command_queue queue,
+                                           int num_elements)
+{
+    std::vector<std::string> required_extensions;
+    required_extensions.emplace_back("cl_khr_subgroup_extended_types");
+    constexpr size_t global_size = 2000;
+    constexpr size_t local_size = 200;
+    WorkGroupParams test_params(global_size, local_size, required_extensions);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+
+    // Broadcast for vector widths 2, 3, 4, 8 and 16 of each element type
+    int status = run_broadcast_for_extended_type<cl_uint2>(rft);
+    status |= run_broadcast_for_extended_type<subgroups::cl_uint3>(rft);
+    status |= run_broadcast_for_extended_type<cl_uint4>(rft);
+    status |= run_broadcast_for_extended_type<cl_uint8>(rft);
+    status |= run_broadcast_for_extended_type<cl_uint16>(rft);
+    status |= run_broadcast_for_extended_type<cl_int2>(rft);
+    status |= run_broadcast_for_extended_type<subgroups::cl_int3>(rft);
+    status |= run_broadcast_for_extended_type<cl_int4>(rft);
+    status |= run_broadcast_for_extended_type<cl_int8>(rft);
+    status |= run_broadcast_for_extended_type<cl_int16>(rft);
+
+    status |= run_broadcast_for_extended_type<cl_ulong2>(rft);
+    status |= run_broadcast_for_extended_type<subgroups::cl_ulong3>(rft);
+    status |= run_broadcast_for_extended_type<cl_ulong4>(rft);
+    status |= run_broadcast_for_extended_type<cl_ulong8>(rft);
+    status |= run_broadcast_for_extended_type<cl_ulong16>(rft);
+    status |= run_broadcast_for_extended_type<cl_long2>(rft);
+    status |= run_broadcast_for_extended_type<subgroups::cl_long3>(rft);
+    status |= run_broadcast_for_extended_type<cl_long4>(rft);
+    status |= run_broadcast_for_extended_type<cl_long8>(rft);
+    status |= run_broadcast_for_extended_type<cl_long16>(rft);
+
+    status |= run_broadcast_for_extended_type<cl_float2>(rft);
+    status |= run_broadcast_for_extended_type<subgroups::cl_float3>(rft);
+    status |= run_broadcast_for_extended_type<cl_float4>(rft);
+    status |= run_broadcast_for_extended_type<cl_float8>(rft);
+    status |= run_broadcast_for_extended_type<cl_float16>(rft);
+
+    status |= run_broadcast_for_extended_type<cl_double2>(rft);
+    status |= run_broadcast_for_extended_type<subgroups::cl_double3>(rft);
+    status |= run_broadcast_for_extended_type<cl_double4>(rft);
+    status |= run_broadcast_for_extended_type<cl_double8>(rft);
+    status |= run_broadcast_for_extended_type<cl_double16>(rft);
+
+    status |= run_broadcast_for_extended_type<cl_ushort2>(rft);
+    status |= run_broadcast_for_extended_type<subgroups::cl_ushort3>(rft);
+    status |= run_broadcast_for_extended_type<cl_ushort4>(rft);
+    status |= run_broadcast_for_extended_type<cl_ushort8>(rft);
+    status |= run_broadcast_for_extended_type<cl_ushort16>(rft);
+    status |= run_broadcast_for_extended_type<cl_short2>(rft);
+    status |= run_broadcast_for_extended_type<subgroups::cl_short3>(rft);
+    status |= run_broadcast_for_extended_type<cl_short4>(rft);
+    status |= run_broadcast_for_extended_type<cl_short8>(rft);
+    status |= run_broadcast_for_extended_type<cl_short16>(rft);
+
+    status |= run_broadcast_for_extended_type<cl_uchar2>(rft);
+    status |= run_broadcast_for_extended_type<subgroups::cl_uchar3>(rft);
+    status |= run_broadcast_for_extended_type<cl_uchar4>(rft);
+    status |= run_broadcast_for_extended_type<cl_uchar8>(rft);
+    status |= run_broadcast_for_extended_type<cl_uchar16>(rft);
+    status |= run_broadcast_for_extended_type<cl_char2>(rft);
+    status |= run_broadcast_for_extended_type<subgroups::cl_char3>(rft);
+    status |= run_broadcast_for_extended_type<cl_char4>(rft);
+    status |= run_broadcast_for_extended_type<cl_char8>(rft);
+    status |= run_broadcast_for_extended_type<cl_char16>(rft);
+
+    status |= run_broadcast_for_extended_type<subgroups::cl_half2>(rft);
+    status |= run_broadcast_for_extended_type<subgroups::cl_half3>(rft);
+    status |= run_broadcast_for_extended_type<subgroups::cl_half4>(rft);
+    status |= run_broadcast_for_extended_type<subgroups::cl_half8>(rft);
+    status |= run_broadcast_for_extended_type<subgroups::cl_half16>(rft);
+
+    status |= run_scan_reduction_for_type<cl_uchar>(rft);
+    status |= run_scan_reduction_for_type<cl_char>(rft);
+    status |= run_scan_reduction_for_type<cl_ushort>(rft);
+    status |= run_scan_reduction_for_type<cl_short>(rft);
+    return status;
+}
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
new file mode 100644
index 0000000000..eb46ff092c
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
@@ -0,0 +1,473 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "harness/typeWrappers.h"
+#include "subgroup_common_templates.h"
+
+namespace {
+// Inclusive scans; bit (sg local id % 32) of WORK_ITEMS_MASK enables the call.
+static const char *scinadd_non_uniform_source = R"(
+    __kernel void test_scinadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_add(in[gid]);
+            }
+    }
+)";
+// Same gating in every kernel below; unselected work items never write out[gid].
+static const char *scinmax_non_uniform_source = R"(
+    __kernel void test_scinmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_max(in[gid]);
+            }
+    }
+)";
+
+static const char *scinmin_non_uniform_source = R"(
+    __kernel void test_scinmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_min(in[gid]);
+            }
+    }
+)";
+
+static const char *scinmul_non_uniform_source = R"(
+    __kernel void test_scinmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_mul(in[gid]);
+            }
+    }
+)";
+
+static const char *scinand_non_uniform_source = R"(
+    __kernel void test_scinand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_and(in[gid]);
+            }
+    }
+)";
+
+static const char *scinor_non_uniform_source = R"(
+    __kernel void test_scinor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_or(in[gid]);
+            }
+    }
+)";
+
+static const char *scinxor_non_uniform_source = R"(
+    __kernel void test_scinxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_xor(in[gid]);
+            }
+    }
+)";
+// Logical and/or/xor variants of the inclusive scans.
+static const char *scinand_non_uniform_logical_source = R"(
+    __kernel void test_scinand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_logical_and(in[gid]);
+            }
+    }
+)";
+
+static const char *scinor_non_uniform_logical_source = R"(
+    __kernel void test_scinor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_logical_or(in[gid]);
+            }
+    }
+)";
+
+static const char *scinxor_non_uniform_logical_source = R"(
+    __kernel void test_scinxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_logical_xor(in[gid]);
+            }
+    }
+)";
+// Exclusive scans; bit (sg local id % 32) of WORK_ITEMS_MASK enables the call.
+static const char *scexadd_non_uniform_source = R"(
+    __kernel void test_scexadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_add(in[gid]);
+            }
+    }
+)";
+
+static const char *scexmax_non_uniform_source = R"(
+    __kernel void test_scexmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_max(in[gid]);
+            }
+    }
+)";
+
+static const char *scexmin_non_uniform_source = R"(
+    __kernel void test_scexmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_min(in[gid]);
+            }
+    }
+)";
+
+static const char *scexmul_non_uniform_source = R"(
+    __kernel void test_scexmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_mul(in[gid]);
+            }
+    }
+)";
+
+static const char *scexand_non_uniform_source = R"(
+    __kernel void test_scexand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_and(in[gid]);
+            }
+    }
+)";
+
+static const char *scexor_non_uniform_source = R"(
+    __kernel void test_scexor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_or(in[gid]);
+            }
+    }
+)";
+
+static const char *scexxor_non_uniform_source = R"(
+    __kernel void test_scexxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_xor(in[gid]);
+            }
+    }
+)";
+// Logical and/or/xor variants of the exclusive scans.
+static const char *scexand_non_uniform_logical_source = R"(
+    __kernel void test_scexand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_logical_and(in[gid]);
+            }
+    }
+)";
+
+static const char *scexor_non_uniform_logical_source = R"(
+    __kernel void test_scexor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_logical_or(in[gid]);
+            }
+    }
+)";
+
+static const char *scexxor_non_uniform_logical_source = R"(
+    __kernel void test_scexxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_logical_xor(in[gid]);
+            }
+    }
+)";
+
+static const char *redadd_non_uniform_source = R"(
+    __kernel void test_redadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_add(in[gid]);
+            }
+    }
+)";
+
+static const char *redmax_non_uniform_source = R"(
+    __kernel void test_redmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_max(in[gid]);
+            }
+    }
+)"; // work items enabled by WORK_ITEMS_MASK store reduce_max
+
+static const char *redmin_non_uniform_source = R"(
+    __kernel void test_redmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_min(in[gid]);
+            }
+    }
+)"; // work items enabled by WORK_ITEMS_MASK store reduce_min
+
+static const char *redmul_non_uniform_source = R"(
+    __kernel void test_redmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_mul(in[gid]);
+            }
+    }
+)"; // work items enabled by WORK_ITEMS_MASK store reduce_mul
+
+static const char *redand_non_uniform_source = R"(
+    __kernel void test_redand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_and(in[gid]);
+            }
+    }
+)"; // work items enabled by WORK_ITEMS_MASK store reduce_and
+
+static const char *redor_non_uniform_source = R"(
+    __kernel void test_redor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_or(in[gid]);
+            }
+    }
+)"; // work items enabled by WORK_ITEMS_MASK store reduce_or
+
+static const char *redxor_non_uniform_source = R"(
+    __kernel void test_redxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_xor(in[gid]);
+            }
+    }
+)"; // work items enabled by WORK_ITEMS_MASK store reduce_xor
+
+static const char *redand_non_uniform_logical_source = R"(
+    __kernel void test_redand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_logical_and(in[gid]);
+            }
+    }
+)"; // work items enabled by WORK_ITEMS_MASK store reduce_logical_and
+
+static const char *redor_non_uniform_logical_source = R"(
+    __kernel void test_redor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_logical_or(in[gid]);
+            }
+    }
+)"; // work items enabled by WORK_ITEMS_MASK store reduce_logical_or
+
+static const char *redxor_non_uniform_logical_source = R"(
+    __kernel void test_redxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_logical_xor(in[gid]);
+            }
+    }
+)"; // work items enabled by WORK_ITEMS_MASK store reduce_logical_xor
+
+template <typename T>
+int run_functions_add_mul_max_min_for_type(RunTestForType rft)
+{ // runs the scin*, scex* and red* kernels for add, mul, max and min on T
+    int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
+        "test_scinadd_non_uniform", scinadd_non_uniform_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::mul_>>(
+        "test_scinmul_non_uniform", scinmul_non_uniform_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
+        "test_scinmax_non_uniform", scinmax_non_uniform_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
+        "test_scinmin_non_uniform", scinmin_non_uniform_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
+        "test_scexadd_non_uniform", scexadd_non_uniform_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::mul_>>(
+        "test_scexmul_non_uniform", scexmul_non_uniform_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
+        "test_scexmax_non_uniform", scexmax_non_uniform_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
+        "test_scexmin_non_uniform", scexmin_non_uniform_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>(
+        "test_redadd_non_uniform", redadd_non_uniform_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::mul_>>(
+        "test_redmul_non_uniform", redmul_non_uniform_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>(
+        "test_redmax_non_uniform", redmax_non_uniform_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>(
+        "test_redmin_non_uniform", redmin_non_uniform_source);
+    return error; // bitwise OR of all run_impl results
+}
+
+template <typename T> int run_functions_and_or_xor_for_type(RunTestForType rft)
+{ // runs the scin*, scex* and red* kernels for bitwise and, or and xor on T
+    int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::and_>>(
+        "test_scinand_non_uniform", scinand_non_uniform_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::or_>>(
+        "test_scinor_non_uniform", scinor_non_uniform_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::xor_>>(
+        "test_scinxor_non_uniform", scinxor_non_uniform_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::and_>>(
+        "test_scexand_non_uniform", scexand_non_uniform_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::or_>>(
+        "test_scexor_non_uniform", scexor_non_uniform_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::xor_>>(
+        "test_scexxor_non_uniform", scexxor_non_uniform_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::and_>>(
+        "test_redand_non_uniform", redand_non_uniform_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::or_>>(
+        "test_redor_non_uniform", redor_non_uniform_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::xor_>>(
+        "test_redxor_non_uniform", redxor_non_uniform_source);
+    return error; // bitwise OR of all run_impl results
+}
+
+template <typename T>
+int run_functions_logical_and_or_xor_for_type(RunTestForType rft)
+{ // runs the scin*, scex* and red* *_logical kernels for and, or and xor on T
+    int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_and>>(
+        "test_scinand_non_uniform_logical", scinand_non_uniform_logical_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_or>>(
+        "test_scinor_non_uniform_logical", scinor_non_uniform_logical_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_xor>>(
+        "test_scinxor_non_uniform_logical", scinxor_non_uniform_logical_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_and>>(
+        "test_scexand_non_uniform_logical", scexand_non_uniform_logical_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_or>>(
+        "test_scexor_non_uniform_logical", scexor_non_uniform_logical_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_xor>>(
+        "test_scexxor_non_uniform_logical", scexxor_non_uniform_logical_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_and>>(
+        "test_redand_non_uniform_logical", redand_non_uniform_logical_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_or>>(
+        "test_redor_non_uniform_logical", redor_non_uniform_logical_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_xor>>(
+        "test_redxor_non_uniform_logical", redxor_non_uniform_logical_source);
+    return error; // bitwise OR of all run_impl results
+}
+
+}
+
+int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device,
+                                                   cl_context context,
+                                                   cl_command_queue queue,
+                                                   int num_elements)
+{ // entry point: runs every non-uniform arithmetic test, ORs the results
+    std::vector<std::string> required_extensions = {
+        "cl_khr_subgroup_non_uniform_arithmetic"
+    };
+    std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
+                                 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
+                                 0x00ffff00, 0x80000000, 0xaaaaaaaa };
+    // 2000 work items split into work-groups of 200.
+    constexpr size_t global_work_size = 2000;
+    constexpr size_t local_work_size = 200;
+    WorkGroupParams test_params(global_work_size, local_work_size,
+                                required_extensions, masks);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+    // add/mul/max/min: integer, float, double and half element types.
+    int error = run_functions_add_mul_max_min_for_type<cl_int>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_uint>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_long>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_ulong>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_short>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_ushort>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_char>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_uchar>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_float>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_double>(rft);
+    error |= run_functions_add_mul_max_min_for_type<subgroups::cl_half>(rft);
+    // Bitwise and/or/xor: integer element types only.
+    error |= run_functions_and_or_xor_for_type<cl_int>(rft);
+    error |= run_functions_and_or_xor_for_type<cl_uint>(rft);
+    error |= run_functions_and_or_xor_for_type<cl_long>(rft);
+    error |= run_functions_and_or_xor_for_type<cl_ulong>(rft);
+    error |= run_functions_and_or_xor_for_type<cl_short>(rft);
+    error |= run_functions_and_or_xor_for_type<cl_ushort>(rft);
+    error |= run_functions_and_or_xor_for_type<cl_char>(rft);
+    error |= run_functions_and_or_xor_for_type<cl_uchar>(rft);
+    // Logical and/or/xor: cl_int only.
+    error |= run_functions_logical_and_or_xor_for_type<cl_int>(rft);
+    return error;
+}
\ No newline at end of file
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
new file mode 100644
index 0000000000..2b00b4dd27
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
@@ -0,0 +1,303 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "harness/typeWrappers.h"
+#include <set>
+
+namespace {
+
+// Host-side input generator (gen) and result checker (chk) for the
+// sub_group_elect and sub_group_non_uniform_{all,any,all_equal} kernels.
+template <typename T, NonUniformVoteOp operation> struct VOTE
+{
+    // Writes one 0/1 input per work item into x, staging each work-group in
+    // t. m is only advanced; elect takes no input so nothing is generated.
+    static void gen(T *x, T *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int i, ii, j, k, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        int nj = (nw + ns - 1) / ns;
+        int non_uniform_size = ng % nw;
+        // Count the trailing partial work-group too, exactly as chk() does;
+        // otherwise its inputs are never generated.
+        ng = ng / nw + (non_uniform_size ? 1 : 0);
+        int last_subgroup_size = 0;
+
+        log_info("  sub_group_%s%s... \n",
+                 (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
+                 operation_names(operation));
+
+        log_info("  test params: global size = %d local size = %d subgroups "
+                 "size = %d work item mask = 0x%x data type (%s)\n",
+                 test_params.global_workgroup_size, nw, ns, work_items_mask,
+                 TypeManager<T>::name());
+        if (non_uniform_size)
+        {
+            log_info("  non uniform work group size mode ON\n");
+        }
+        if (operation == NonUniformVoteOp::elect) return;
+
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            if (non_uniform_size && k == ng - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, nj, ns, nw,
+                                          last_subgroup_size);
+            }
+            for (j = 0; j < nj; ++j)
+            { // for each subgroup
+                ii = j * ns;
+                if (last_subgroup_size && j == nj - 1)
+                {
+                    n = last_subgroup_size;
+                }
+                else
+                {
+                    n = ii + ns > nw ? nw - ii : ns;
+                }
+                // One draw per subgroup: 0 or 1 gives every work item that
+                // value, 2 alternates 0/1 across the work items.
+                int e = genrand_int32(gMTdata) % 3;
+
+                for (i = 0; i < n; i++)
+                {
+                    int value = (e == 2) ? i % 2 : e;
+                    set_value(t[ii + i], value);
+                }
+            }
+            // Now map into work group using map from device
+            for (j = 0; j < nw; ++j)
+            {
+                x[j] = t[j];
+            }
+            x += nw;
+            m += 4 * nw;
+        }
+    }
+
+    // Recomputes the expected vote per subgroup from host inputs x and checks
+    // device outputs y of every active work item. Returns TEST_PASS/TEST_FAIL.
+    static int chk(T *x, T *y, T *mx, T *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int ii, i, j, k, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        int nj = (nw + ns - 1) / ns;
+        cl_int tr = 0, rr = 0;
+        int non_uniform_size = ng % nw;
+        ng = ng / nw;
+        if (non_uniform_size) ng++;
+        int last_subgroup_size = 0;
+
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            if (non_uniform_size && k == ng - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, nj, ns, nw,
+                                          last_subgroup_size);
+            }
+            for (j = 0; j < nw; ++j)
+            { // inside the work_group
+                mx[j] = x[j]; // read host inputs for work_group
+                my[j] = y[j]; // read device outputs for work_group
+            }
+
+            for (j = 0; j < nj; ++j)
+            { // for each subgroup
+                ii = j * ns;
+                if (last_subgroup_size && j == nj - 1)
+                {
+                    n = last_subgroup_size;
+                }
+                else
+                {
+                    n = ii + ns > nw ? nw - ii : ns;
+                }
+
+                // all/all_equal start true and are AND-ed down; any starts
+                // false and is OR-ed up; elect is decided per work item below.
+                tr = (operation == NonUniformVoteOp::all
+                      || operation == NonUniformVoteOp::all_equal);
+
+                std::set<int> active_work_items;
+                for (i = 0; i < n; ++i)
+                {
+                    // 1u keeps the shift into bit 31 in unsigned arithmetic.
+                    uint32_t check_work_item = 1u << (i % 32);
+                    if (work_items_mask & check_work_item)
+                    {
+                        active_work_items.insert(i);
+                        switch (operation)
+                        {
+                            case NonUniformVoteOp::elect: break;
+                            // all/any treat a non zero input as true.
+                            case NonUniformVoteOp::all:
+                                tr &=
+                                    !compare_ordered<T>(mx[ii + i], 0) ? 1 : 0;
+                                break;
+                            case NonUniformVoteOp::any:
+                                tr |=
+                                    !compare_ordered<T>(mx[ii + i], 0) ? 1 : 0;
+                                break;
+                            case NonUniformVoteOp::all_equal:
+                                // compare against the lowest active work item
+                                tr &= compare_ordered<T>(
+                                          mx[ii + i],
+                                          mx[ii + *active_work_items.begin()])
+                                    ? 1 : 0;
+                                break;
+                            default:
+                                log_error("Unknown operation\n");
+                                return TEST_FAIL;
+                        }
+                    }
+                }
+                if (active_work_items.empty())
+                {
+                    log_info("  no one workitem active... in workgroup id = %d "
+                             "subgroup id = %d\n",
+                             k, j);
+                }
+                else
+                {
+                    auto lowest_active = active_work_items.begin();
+                    for (const int &active_work_item : active_work_items)
+                    {
+                        i = active_work_item;
+                        if (operation == NonUniformVoteOp::elect)
+                            tr = (i == *lowest_active) ? 1 : 0;
+
+                        // normalize device values on host, non zero set 1.
+                        rr = compare_ordered<T>(my[ii + i], 0) ? 0 : 1;
+
+                        if (rr != tr)
+                        {
+                            log_error("ERROR: sub_group_%s() \n",
+                                      operation_names(operation));
+                            log_error(
+                                "mismatch for work item %d sub group %d in "
+                                "work group %d. Expected: %d Obtained: %d\n",
+                                i, j, k, tr, rr);
+                            return TEST_FAIL;
+                        }
+                    }
+                }
+            }
+
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+
+        log_info("  sub_group_%s%s... passed\n",
+                 (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
+                 operation_names(operation));
+        return TEST_PASS;
+    }
+};
+static const char *elect_source = R"(
+    __kernel void test_elect(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_elect();
+            }
+    }
+)"; // work items enabled by WORK_ITEMS_MASK store sub_group_elect()
+
+static const char *non_uniform_any_source = R"(
+    __kernel void test_non_uniform_any(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_any(in[gid]);
+            }
+    }
+)"; // work items enabled by WORK_ITEMS_MASK store non_uniform_any
+
+static const char *non_uniform_all_source = R"(
+    __kernel void test_non_uniform_all(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_all(in[gid]);
+            }
+    }
+)"; // work items enabled by WORK_ITEMS_MASK store non_uniform_all
+
+static const char *non_uniform_all_equal_source = R"(
+    __kernel void test_non_uniform_all_equal(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_all_equal(in[gid]);
+            }
+    }
+)"; // work items enabled by WORK_ITEMS_MASK store non_uniform_all_equal
+
+template <typename T> int run_vote_all_equal_for_type(RunTestForType rft)
+{ // runs the test_non_uniform_all_equal kernel for element type T
+    int error = rft.run_impl<T, VOTE<T, NonUniformVoteOp::all_equal>>(
+        "test_non_uniform_all_equal", non_uniform_all_equal_source);
+    return error; // result of the single run_impl call
+}
+}
+
+int test_subgroup_functions_non_uniform_vote(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements)
+{ // entry point: runs every non-uniform vote test, ORs the results
+    std::vector<std::string> required_extensions = {
+        "cl_khr_subgroup_non_uniform_vote"
+    };
+
+    std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
+                                 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
+                                 0x00ffff00, 0x80000000 };
+    constexpr size_t global_work_size = 170; // 2 * 64 + 42: not a multiple
+    constexpr size_t local_work_size = 64;
+    WorkGroupParams test_params(global_work_size, local_work_size,
+                                required_extensions, masks);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+    // all_equal is run for integer, float, double and half element types.
+    int error = run_vote_all_equal_for_type<cl_int>(rft);
+    error |= run_vote_all_equal_for_type<cl_uint>(rft);
+    error |= run_vote_all_equal_for_type<cl_long>(rft);
+    error |= run_vote_all_equal_for_type<cl_ulong>(rft);
+    error |= run_vote_all_equal_for_type<cl_float>(rft);
+    error |= run_vote_all_equal_for_type<cl_double>(rft);
+    error |= run_vote_all_equal_for_type<subgroups::cl_half>(rft);
+    // all, elect and any are run for cl_int only.
+    error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::all>>(
+        "test_non_uniform_all", non_uniform_all_source);
+    error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::elect>>(
+        "test_elect", elect_source);
+    error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::any>>(
+        "test_non_uniform_any", non_uniform_any_source);
+    return error;
+}
diff --git a/test_conformance/subgroups/test_subgroup_shuffle.cpp b/test_conformance/subgroups/test_subgroup_shuffle.cpp
new file mode 100644
index 0000000000..049f09824b
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_shuffle.cpp
@@ -0,0 +1,78 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "subgroup_common_templates.h"
+#include "harness/typeWrappers.h"
+#include <bitset>
+
+namespace {
+
+static const char* shuffle_xor_source =
+    "__kernel void test_sub_group_shuffle_xor(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    out[gid] = sub_group_shuffle_xor(x, xy[gid].z);"
+    "}\n"; // out[gid] = shuffle_xor of in[gid], second argument xy[gid].z
+
+static const char* shuffle_source =
+    "__kernel void test_sub_group_shuffle(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    out[gid] = sub_group_shuffle(x, xy[gid].z);"
+    "}\n"; // out[gid] = shuffle of in[gid], second argument xy[gid].z
+
+template <typename T> int run_shuffle_for_type(RunTestForType rft)
+{ // runs the shuffle and shuffle_xor kernels for element type T
+    int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>(
+        "test_sub_group_shuffle", shuffle_source);
+    error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_xor>>(
+        "test_sub_group_shuffle_xor", shuffle_xor_source);
+    return error; // bitwise OR of both run_impl results
+}
+
+}
+
+int test_subgroup_functions_shuffle(cl_device_id device, cl_context context,
+                                    cl_command_queue queue, int num_elements)
+{ // entry point: runs shuffle and shuffle_xor per type, ORs the results
+    std::vector<std::string> required_extensions{ "cl_khr_subgroup_shuffle" };
+    constexpr size_t global_work_size = 2000; // work-groups of 200
+    constexpr size_t local_work_size = 200;
+    WorkGroupParams test_params(global_work_size, local_work_size,
+                                required_extensions);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+    // Integer, float, double and half element types.
+    int error = run_shuffle_for_type<cl_int>(rft);
+    error |= run_shuffle_for_type<cl_uint>(rft);
+    error |= run_shuffle_for_type<cl_long>(rft);
+    error |= run_shuffle_for_type<cl_ulong>(rft);
+    error |= run_shuffle_for_type<cl_short>(rft);
+    error |= run_shuffle_for_type<cl_ushort>(rft);
+    error |= run_shuffle_for_type<cl_char>(rft);
+    error |= run_shuffle_for_type<cl_uchar>(rft);
+    error |= run_shuffle_for_type<cl_float>(rft);
+    error |= run_shuffle_for_type<cl_double>(rft);
+    error |= run_shuffle_for_type<subgroups::cl_half>(rft);
+
+    return error;
+}
diff --git a/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp
new file mode 100644
index 0000000000..6000c9702a
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp
@@ -0,0 +1,81 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "subgroup_common_templates.h"
+#include "harness/conversions.h"
+#include "harness/typeWrappers.h"
+
+namespace {
+
+static const char* shuffle_down_source =
+    "__kernel void test_sub_group_shuffle_down(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    out[gid] = sub_group_shuffle_down(x, xy[gid].z);"
+    "}\n"; // out[gid] = shuffle_down of in[gid], second argument xy[gid].z
+static const char* shuffle_up_source =
+    "__kernel void test_sub_group_shuffle_up(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    out[gid] = sub_group_shuffle_up(x, xy[gid].z);"
+    "}\n"; // out[gid] = shuffle_up of in[gid], second argument xy[gid].z
+
+template <typename T> int run_shuffle_relative_for_type(RunTestForType rft)
+{ // runs the shuffle_up and shuffle_down kernels for element type T
+    int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>(
+        "test_sub_group_shuffle_up", shuffle_up_source);
+    error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_down>>(
+        "test_sub_group_shuffle_down", shuffle_down_source);
+    return error; // bitwise OR of both run_impl results
+}
+
+}
+
+int test_subgroup_functions_shuffle_relative(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements)
+{ // entry point: runs shuffle_up and shuffle_down per type, ORs the results
+    std::vector<std::string> required_extensions = {
+        "cl_khr_subgroup_shuffle_relative"
+    };
+    constexpr size_t global_work_size = 2000; // work-groups of 200
+    constexpr size_t local_work_size = 200;
+    WorkGroupParams test_params(global_work_size, local_work_size,
+                                required_extensions);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+    // Integer, float, double and half element types.
+    int error = run_shuffle_relative_for_type<cl_int>(rft);
+    error |= run_shuffle_relative_for_type<cl_uint>(rft);
+    error |= run_shuffle_relative_for_type<cl_long>(rft);
+    error |= run_shuffle_relative_for_type<cl_ulong>(rft);
+    error |= run_shuffle_relative_for_type<cl_short>(rft);
+    error |= run_shuffle_relative_for_type<cl_ushort>(rft);
+    error |= run_shuffle_relative_for_type<cl_char>(rft);
+    error |= run_shuffle_relative_for_type<cl_uchar>(rft);
+    error |= run_shuffle_relative_for_type<cl_float>(rft);
+    error |= run_shuffle_relative_for_type<cl_double>(rft);
+    error |= run_shuffle_relative_for_type<subgroups::cl_half>(rft);
+
+    return error;
+}
diff --git a/test_conformance/subgroups/test_workgroup.cpp b/test_conformance/subgroups/test_workgroup.cpp
deleted file mode 100644
index 779d30f680..0000000000
--- a/test_conformance/subgroups/test_workgroup.cpp
+++ /dev/null
@@ -1,727 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "procs.h"
-#include "subhelpers.h"
-#include "harness/conversions.h"
-#include "harness/typeWrappers.h"
-
-static const char *any_source = "__kernel void test_any(const __global Type "
-                                "*in, __global int2 *xy, __global Type *out)\n"
-                                "{\n"
-                                "    int gid = get_global_id(0);\n"
-                                "    XY(xy,gid);\n"
-                                "    out[gid] = sub_group_any(in[gid]);\n"
-                                "}\n";
-
-static const char *all_source = "__kernel void test_all(const __global Type "
-                                "*in, __global int2 *xy, __global Type *out)\n"
-                                "{\n"
-                                "    int gid = get_global_id(0);\n"
-                                "    XY(xy,gid);\n"
-                                "    out[gid] = sub_group_all(in[gid]);\n"
-                                "}\n";
-
-static const char *bcast_source =
-    "__kernel void test_bcast(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    size_t loid = (size_t)((int)x % 100);\n"
-    "    out[gid] = sub_group_broadcast(x, loid);\n"
-    "}\n";
-
-static const char *redadd_source =
-    "__kernel void test_redadd(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_reduce_add(in[gid]);\n"
-    "}\n";
-
-static const char *redmax_source =
-    "__kernel void test_redmax(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_reduce_max(in[gid]);\n"
-    "}\n";
-
-static const char *redmin_source =
-    "__kernel void test_redmin(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_reduce_min(in[gid]);\n"
-    "}\n";
-
-static const char *scinadd_source =
-    "__kernel void test_scinadd(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
-    "}\n";
-
-static const char *scinmax_source =
-    "__kernel void test_scinmax(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
-    "}\n";
-
-static const char *scinmin_source =
-    "__kernel void test_scinmin(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
-    "}\n";
-
-static const char *scexadd_source =
-    "__kernel void test_scexadd(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
-    "}\n";
-
-static const char *scexmax_source =
-    "__kernel void test_scexmax(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
-    "}\n";
-
-static const char *scexmin_source =
-    "__kernel void test_scexmin(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
-    "}\n";
-
-
-// Any/All test functions
-template <int Which> struct AA
-{
-    static void gen(cl_int *x, cl_int *t, cl_int *m, int ns, int nw, int ng)
-    {
-        int i, ii, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-        int e;
-
-        ii = 0;
-        for (k = 0; k < ng; ++k)
-        {
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-                e = (int)(genrand_int32(gMTdata) % 3);
-
-                // Initialize data matrix indexed by local id and sub group id
-                switch (e)
-                {
-                    case 0: memset(&t[ii], 0, n * sizeof(cl_int)); break;
-                    case 1:
-                        memset(&t[ii], 0, n * sizeof(cl_int));
-                        i = (int)(genrand_int32(gMTdata) % (cl_uint)n);
-                        t[ii + i] = 41;
-                        break;
-                    case 2: memset(&t[ii], 0xff, n * sizeof(cl_int)); break;
-                }
-            }
-
-            // Now map into work group using map from device
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                x[j] = t[i];
-            }
-
-            x += nw;
-            m += 2 * nw;
-        }
-    }
-
-    static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
-                   int ns, int nw, int ng)
-    {
-        int ii, i, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-        cl_int taa, raa;
-
-        log_info("  sub_group_%s...\n", Which == 0 ? "any" : "all");
-
-        for (k = 0; k < ng; ++k)
-        {
-            // Map to array indexed to array indexed by local ID and sub group
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                mx[i] = x[j];
-                my[i] = y[j];
-            }
-
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-
-                // Compute target
-                if (Which == 0)
-                {
-                    taa = 0;
-                    for (i = 0; i < n; ++i) taa |= mx[ii + i] != 0;
-                }
-                else
-                {
-                    taa = 1;
-                    for (i = 0; i < n; ++i) taa &= mx[ii + i] != 0;
-                }
-
-                // Check result
-                for (i = 0; i < n; ++i)
-                {
-                    raa = my[ii + i] != 0;
-                    if (raa != taa)
-                    {
-                        log_error("ERROR: sub_group_%s mismatch for local id "
-                                  "%d in sub group %d in group %d\n",
-                                  Which == 0 ? "any" : "all", i, j, k);
-                        return -1;
-                    }
-                }
-            }
-
-            x += nw;
-            y += nw;
-            m += 2 * nw;
-        }
-
-        return 0;
-    }
-};
-
-// Reduce functions
-template <typename Ty, int Which> struct RED
-{
-    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
-    {
-        int i, ii, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-
-        ii = 0;
-        for (k = 0; k < ng; ++k)
-        {
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-
-                for (i = 0; i < n; ++i)
-                    t[ii + i] = (Ty)(
-                        (int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1);
-            }
-
-            // Now map into work group using map from device
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                x[j] = t[i];
-            }
-
-            x += nw;
-            m += 2 * nw;
-        }
-    }
-
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
-                   int ng)
-    {
-        int ii, i, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-        Ty tr, rr;
-
-        log_info("  sub_group_reduce_%s(%s)...\n",
-                 Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
-                 TypeName<Ty>::val());
-
-        for (k = 0; k < ng; ++k)
-        {
-            // Map to array indexed to array indexed by local ID and sub group
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                mx[i] = x[j];
-                my[i] = y[j];
-            }
-
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-
-                // Compute target
-                if (Which == 0)
-                {
-                    // add
-                    tr = mx[ii];
-                    for (i = 1; i < n; ++i) tr += mx[ii + i];
-                }
-                else if (Which == 1)
-                {
-                    // max
-                    tr = mx[ii];
-                    for (i = 1; i < n; ++i)
-                        tr = tr > mx[ii + i] ? tr : mx[ii + i];
-                }
-                else if (Which == 2)
-                {
-                    // min
-                    tr = mx[ii];
-                    for (i = 1; i < n; ++i)
-                        tr = tr > mx[ii + i] ? mx[ii + i] : tr;
-                }
-
-                // Check result
-                for (i = 0; i < n; ++i)
-                {
-                    rr = my[ii + i];
-                    if (rr != tr)
-                    {
-                        log_error("ERROR: sub_group_reduce_%s(%s) mismatch for "
-                                  "local id %d in sub group %d in group %d\n",
-                                  Which == 0 ? "add"
-                                             : (Which == 1 ? "max" : "min"),
-                                  TypeName<Ty>::val(), i, j, k);
-                        return -1;
-                    }
-                }
-            }
-
-            x += nw;
-            y += nw;
-            m += 2 * nw;
-        }
-
-        return 0;
-    }
-};
-
-// Scan Inclusive functions
-template <typename Ty, int Which> struct SCIN
-{
-    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
-    {
-        int i, ii, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-
-        ii = 0;
-        for (k = 0; k < ng; ++k)
-        {
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-
-                for (i = 0; i < n; ++i)
-                    // t[ii+i] = (Ty)((int)(genrand_int32(gMTdata) & 0x7fffffff)
-                    // % ns + 1);
-                    t[ii + i] = (Ty)i;
-            }
-
-            // Now map into work group using map from device
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                x[j] = t[i];
-            }
-
-            x += nw;
-            m += 2 * nw;
-        }
-    }
-
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
-                   int ng)
-    {
-        int ii, i, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-        Ty tr, rr;
-
-        log_info("  sub_group_scan_inclusive_%s(%s)...\n",
-                 Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
-                 TypeName<Ty>::val());
-
-        for (k = 0; k < ng; ++k)
-        {
-            // Map to array indexed to array indexed by local ID and sub group
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                mx[i] = x[j];
-                my[i] = y[j];
-            }
-
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-
-                // Check result
-                for (i = 0; i < n; ++i)
-                {
-                    if (Which == 0)
-                    {
-                        tr = i == 0 ? mx[ii] : tr + mx[ii + i];
-                    }
-                    else if (Which == 1)
-                    {
-                        tr = i == 0 ? mx[ii]
-                                    : (tr > mx[ii + i] ? tr : mx[ii + i]);
-                    }
-                    else
-                    {
-                        tr = i == 0 ? mx[ii]
-                                    : (tr > mx[ii + i] ? mx[ii + i] : tr);
-                    }
-
-                    rr = my[ii + i];
-                    if (rr != tr)
-                    {
-                        log_error(
-                            "ERROR: sub_group_scan_inclusive_%s(%s) mismatch "
-                            "for local id %d in sub group %d in group %d\n",
-                            Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
-                            TypeName<Ty>::val(), i, j, k);
-                        return -1;
-                    }
-                }
-            }
-
-            x += nw;
-            y += nw;
-            m += 2 * nw;
-        }
-
-        return 0;
-    }
-};
-
-// Scan Exclusive functions
-template <typename Ty, int Which> struct SCEX
-{
-    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
-    {
-        int i, ii, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-
-        ii = 0;
-        for (k = 0; k < ng; ++k)
-        {
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-
-                for (i = 0; i < n; ++i)
-                    t[ii + i] = (Ty)(
-                        (int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1);
-            }
-
-            // Now map into work group using map from device
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                x[j] = t[i];
-            }
-
-            x += nw;
-            m += 2 * nw;
-        }
-    }
-
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
-                   int ng)
-    {
-        int ii, i, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-        Ty tr, trt, rr;
-
-        log_info("  sub_group_scan_exclusive_%s(%s)...\n",
-                 Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
-                 TypeName<Ty>::val());
-
-        for (k = 0; k < ng; ++k)
-        {
-            // Map to array indexed to array indexed by local ID and sub group
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                mx[i] = x[j];
-                my[i] = y[j];
-            }
-
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-
-                // Check result
-                for (i = 0; i < n; ++i)
-                {
-                    if (Which == 0)
-                    {
-                        tr = i == 0 ? TypeIdentity<Ty, Which>::val() : tr + trt;
-                    }
-                    else if (Which == 1)
-                    {
-                        tr = i == 0 ? TypeIdentity<Ty, Which>::val()
-                                    : (trt > tr ? trt : tr);
-                    }
-                    else
-                    {
-                        tr = i == 0 ? TypeIdentity<Ty, Which>::val()
-                                    : (trt > tr ? tr : trt);
-                    }
-                    trt = mx[ii + i];
-                    rr = my[ii + i];
-
-                    if (rr != tr)
-                    {
-                        log_error(
-                            "ERROR: sub_group_scan_exclusive_%s(%s) mismatch "
-                            "for local id %d in sub group %d in group %d\n",
-                            Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
-                            TypeName<Ty>::val(), i, j, k);
-                        return -1;
-                    }
-                }
-            }
-
-            x += nw;
-            y += nw;
-            m += 2 * nw;
-        }
-
-        return 0;
-    }
-};
-
-// Broadcast functios
-template <typename Ty> struct BC
-{
-    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
-    {
-        int i, ii, j, k, l, n;
-        int nj = (nw + ns - 1) / ns;
-        int d = ns > 100 ? 100 : ns;
-
-        ii = 0;
-        for (k = 0; k < ng; ++k)
-        {
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-                l = (int)(genrand_int32(gMTdata) & 0x7fffffff)
-                    % (d > n ? n : d);
-
-                for (i = 0; i < n; ++i)
-                    t[ii + i] = (Ty)((int)(genrand_int32(gMTdata) & 0x7fffffff)
-                                         % 100 * 100
-                                     + l);
-            }
-
-            // Now map into work group using map from device
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                x[j] = t[i];
-            }
-
-            x += nw;
-            m += 2 * nw;
-        }
-    }
-
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
-                   int ng)
-    {
-        int ii, i, j, k, l, n;
-        int nj = (nw + ns - 1) / ns;
-        Ty tr, rr;
-
-        log_info("  sub_group_broadcast(%s)...\n", TypeName<Ty>::val());
-
-        for (k = 0; k < ng; ++k)
-        {
-            // Map to array indexed to array indexed by local ID and sub group
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                mx[i] = x[j];
-                my[i] = y[j];
-            }
-
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-                l = (int)mx[ii] % 100;
-                tr = mx[ii + l];
-
-                // Check result
-                for (i = 0; i < n; ++i)
-                {
-                    rr = my[ii + i];
-                    if (rr != tr)
-                    {
-                        log_error("ERROR: sub_group_broadcast(%s) mismatch for "
-                                  "local id %d in sub group %d in group %d\n",
-                                  TypeName<Ty>::val(), i, j, k);
-                        return -1;
-                    }
-                }
-            }
-
-            x += nw;
-            y += nw;
-            m += 2 * nw;
-        }
-
-        return 0;
-    }
-};
-
-#define G 2000
-#define L 200
-struct run_for_type
-{
-    run_for_type(cl_device_id device, cl_context context,
-                 cl_command_queue queue, int num_elements,
-                 bool useCoreSubgroups)
-    {
-        device_ = device;
-        context_ = context;
-        queue_ = queue;
-        num_elements_ = num_elements;
-        useCoreSubgroups_ = useCoreSubgroups;
-    }
-
-    template <typename T> cl_int run()
-    {
-        cl_int error;
-        error = test<T, BC<T>, G, L>::run(device_, context_, queue_,
-                                          num_elements_, "test_bcast",
-                                          bcast_source, 0, useCoreSubgroups_);
-        error |= test<T, RED<T, 0>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_redadd",
-            redadd_source, 0, useCoreSubgroups_);
-        error |= test<T, RED<T, 1>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_redmax",
-            redmax_source, 0, useCoreSubgroups_);
-        error |= test<T, RED<T, 2>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_redmin",
-            redmin_source, 0, useCoreSubgroups_);
-        error |= test<T, SCIN<T, 0>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_scinadd",
-            scinadd_source, 0, useCoreSubgroups_);
-        error |= test<T, SCIN<T, 1>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_scinmax",
-            scinmax_source, 0, useCoreSubgroups_);
-        error |= test<T, SCIN<T, 2>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_scinmin",
-            scinmin_source, 0, useCoreSubgroups_);
-        error |= test<T, SCEX<T, 0>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_scexadd",
-            scexadd_source, 0, useCoreSubgroups_);
-        error |= test<T, SCEX<T, 1>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_scexmax",
-            scexmax_source, 0, useCoreSubgroups_);
-        error |= test<T, SCEX<T, 2>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_scexmin",
-            scexmin_source, 0, useCoreSubgroups_);
-        return error;
-    }
-
-private:
-    cl_device_id device_;
-    cl_context context_;
-    cl_command_queue queue_;
-    int num_elements_;
-    bool useCoreSubgroups_;
-};
-
-// Entry point from main
-int test_work_group_functions(cl_device_id device, cl_context context,
-                              cl_command_queue queue, int num_elements,
-                              bool useCoreSubgroups)
-{
-    int error;
-    error = test<int, AA<0>, G, L>::run(device, context, queue, num_elements,
-                                        "test_any", any_source, 0,
-                                        useCoreSubgroups);
-    error |= test<int, AA<1>, G, L>::run(device, context, queue, num_elements,
-                                         "test_all", all_source, 0,
-                                         useCoreSubgroups);
-    run_for_type rft(device, context, queue, num_elements, useCoreSubgroups);
-    error |= rft.run<cl_uint>();
-    error |= rft.run<cl_int>();
-    error |= rft.run<cl_ulong>();
-    error |= rft.run<cl_long>();
-    error |= rft.run<float>();
-    error |= rft.run<double>();
-    // error |= rft.run<cl_half>();
-
-    return error;
-}
-
-int test_work_group_functions_core(cl_device_id device, cl_context context,
-                                   cl_command_queue queue, int num_elements)
-{
-    return test_work_group_functions(device, context, queue, num_elements,
-                                     true);
-}
-
-int test_work_group_functions_ext(cl_device_id device, cl_context context,
-                                  cl_command_queue queue, int num_elements)
-{
-    bool hasExtension = is_extension_available(device, "cl_khr_subgroups");
-
-    if (!hasExtension)
-    {
-        log_info(
-            "Device does not support 'cl_khr_subgroups'. Skipping the test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-    return test_work_group_functions(device, context, queue, num_elements,
-                                     false);
-}

From 8e59817ba62e63f16b0f11c7113287d75f9582ec Mon Sep 17 00:00:00 2001
From: "jianguang.li" <891528583@qq.com>
Date: Wed, 7 Apr 2021 16:19:58 +0800
Subject: [PATCH 071/158] Fix conversion failure when building with -O2 (#1193)

* Fix conversion failure when compiling with gcc -O2

* fix format check error

* fix format check error
---
 test_conformance/conversions/fplib.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_conformance/conversions/fplib.cpp b/test_conformance/conversions/fplib.cpp
index 37707b83f0..e739b9ae3a 100644
--- a/test_conformance/conversions/fplib.cpp
+++ b/test_conformance/conversions/fplib.cpp
@@ -29,8 +29,8 @@ static uint32_t clz(uint64_t value)
 
     for( num_zeros = 0; num_zeros < (sizeof(uint64_t)*8); num_zeros++)
     {
-        if(0x8000000000000000 & (value << num_zeros))
-            break;
+        volatile uint64_t v = 0x8000000000000000ull & (value << num_zeros);
+        if (v) break;
     }
     return num_zeros;
 }

From b7e7a3eb65d80d6847bd522f66f876fd5f6fe938 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Tue, 13 Apr 2021 15:58:44 +0100
Subject: [PATCH 072/158] Remove unsupported code (#1211)

* Remove code for runtime measurement

The GetTime() and associated functions are not fully implemented on
Linux. This functionality is assumed to be untested, or unused at best.

Reduce differences between tests by removing this unnecessary feature.
It can be (re-)implemented later, if desired, once the math_brute_force
component is in better shape.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>

* Coalesce if-statements

Signed-off-by: Marco Antognini <marco.antognini@arm.com>

* Keep else branch

Address comments.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/binary_double.cpp        |  87 +--------------
 .../math_brute_force/binary_float.cpp         |  87 +--------------
 .../math_brute_force/binary_i_double.cpp      |  88 +--------------
 .../math_brute_force/binary_i_float.cpp       |  87 +--------------
 .../binary_operator_double.cpp                |  87 +--------------
 .../binary_operator_float.cpp                 |  87 +--------------
 .../binary_two_results_i_double.cpp           |  88 +--------------
 .../binary_two_results_i_float.cpp            |  90 +--------------
 .../math_brute_force/i_unary_double.cpp       |  67 -----------
 .../math_brute_force/i_unary_float.cpp        |  68 ------------
 .../math_brute_force/macro_binary_double.cpp  |  87 ---------------
 .../math_brute_force/macro_binary_float.cpp   |  85 --------------
 .../math_brute_force/macro_unary_double.cpp   |  68 ------------
 .../math_brute_force/macro_unary_float.cpp    |  68 ------------
 .../math_brute_force/mad_double.cpp           | 104 +-----------------
 .../math_brute_force/mad_float.cpp            | 102 +----------------
 test_conformance/math_brute_force/main.cpp    |  77 +------------
 .../math_brute_force/ternary_double.cpp       | 102 +----------------
 .../math_brute_force/ternary_float.cpp        | 102 +----------------
 .../math_brute_force/unary_double.cpp         |  77 +------------
 .../math_brute_force/unary_float.cpp          |  77 +------------
 .../unary_two_results_double.cpp              |  77 +------------
 .../unary_two_results_float.cpp               |  78 +------------
 .../unary_two_results_i_double.cpp            |  76 +------------
 .../unary_two_results_i_float.cpp             |  75 +------------
 .../math_brute_force/unary_u_double.cpp       |  68 +-----------
 .../math_brute_force/unary_u_float.cpp        |  76 +------------
 test_conformance/math_brute_force/utility.h   |   6 -
 28 files changed, 29 insertions(+), 2212 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
index 15a48a6664..e51327dfcd 100644
--- a/test_conformance/math_brute_force/binary_double.cpp
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -436,95 +436,10 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        double *p = (double *)gIn;
-        double *p2 = (double *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
 
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     }
 
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/binary_float.cpp b/test_conformance/math_brute_force/binary_float.cpp
index fb56c642d0..5a37a40771 100644
--- a/test_conformance/math_brute_force/binary_float.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -428,95 +428,10 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        cl_uint *p = (cl_uint *)gIn;
-        cl_uint *p2 = (cl_uint *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
-        {
-            p[j] = (genrand_int32(d) & ~0x40000000) | 0x20000000;
-            p2[j] = 0x3fc00000;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
 
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     }
 
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index 686e32b405..c3c3eb1366 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -437,96 +437,10 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        double *p = (double *)gIn;
-        cl_int *p2 = (cl_int *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = 3;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-        if ((error =
-                 clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                      BUFFER_SIZE / 2, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
 
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
     }
 
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
index d99382f1be..93cb910b34 100644
--- a/test_conformance/math_brute_force/binary_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -430,95 +430,10 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
-        {
-            p[j] = (genrand_int32(d) & ~0x40000000) | 0x38000000;
-            p2[j] = 3;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
 
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
     }
 
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index a02e53ba36..09540b0374 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -430,95 +430,10 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        double *p = (double *)gIn;
-        double *p2 = (double *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
 
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     }
 
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
index 39070cb2f2..a59be16316 100644
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -422,95 +422,10 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
-        {
-            p[j] = (genrand_int32(d) & ~0x40000000) | 0x20000000;
-            p2[j] = 0x3fc00000;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
 
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     }
 
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/binary_two_results_i_double.cpp b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
index d71585e65f..f548486f04 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
@@ -565,96 +565,10 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
 
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
     }
 
-    if (!gSkipCorrectnessTesting)
-        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/binary_two_results_i_float.cpp b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
index b6f1f1bdf0..01c3ca7df8 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
@@ -550,98 +550,10 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        cl_uint *p = (cl_uint *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            p[j] = genrand_int32(d);
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
 
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
     }
 
-    if (!gSkipCorrectnessTesting)
-        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/i_unary_double.cpp b/test_conformance/math_brute_force/i_unary_double.cpp
index c3822f366b..040f8e65dc 100644
--- a/test_conformance/math_brute_force/i_unary_double.cpp
+++ b/test_conformance/math_brute_force/i_unary_double.cpp
@@ -289,73 +289,6 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog("passed");
     }
 
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/i_unary_float.cpp b/test_conformance/math_brute_force/i_unary_float.cpp
index d1fb867df4..61ce9a236a 100644
--- a/test_conformance/math_brute_force/i_unary_float.cpp
+++ b/test_conformance/math_brute_force/i_unary_float.cpp
@@ -286,74 +286,6 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
             vlog("passed");
     }
 
-    if (gMeasureTimes)
-    {
-        // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-            p[j] = genrand_int32(d);
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index cbcecb3345..7c40a18b75 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -406,93 +406,6 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog("passed");
     }
 
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        cl_ulong *p = (cl_ulong *)gIn;
-        cl_ulong *p2 = (cl_ulong *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
-        {
-            p[j] =
-                (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32);
-            p2[j] =
-                (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32);
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
index bbccbbe9b5..12c14dc5a6 100644
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -397,91 +397,6 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             vlog("passed");
     }
 
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        cl_uint *p = (cl_uint *)gIn;
-        cl_uint *p2 = (cl_uint *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
-        {
-            p[j] = genrand_int32(d);
-            p2[j] = genrand_int32(d);
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp
index 154d8ecb94..4fadbfc35b 100644
--- a/test_conformance/math_brute_force/macro_unary_double.cpp
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -272,74 +272,6 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog("passed");
     }
 
-    if (gMeasureTimes)
-    {
-        // Init input array
-        cl_ulong *p = (cl_ulong *)gIn;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/macro_unary_float.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp
index 725a83168a..4e72748ae7 100644
--- a/test_conformance/math_brute_force/macro_unary_float.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -272,74 +272,6 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
             vlog("passed");
     }
 
-    if (gMeasureTimes)
-    {
-        // Init input array
-        cl_uint *p = (cl_uint *)gIn;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
-            p[j] = genrand_int32(d);
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/mad_double.cpp b/test_conformance/math_brute_force/mad_double.cpp
index 5e7dba9879..a5f591838a 100644
--- a/test_conformance/math_brute_force/mad_double.cpp
+++ b/test_conformance/math_brute_force/mad_double.cpp
@@ -285,111 +285,11 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        double *p2 = (double *)gIn2;
-        double *p3 = (double *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-            p3[j] = DoubleFromUInt32(genrand_int32(d));
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
-                                        &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
 
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
         vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
              maxErrorVal3);
+    }
+
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/mad_float.cpp b/test_conformance/math_brute_force/mad_float.cpp
index 453e43e79b..ff4e99d0dc 100644
--- a/test_conformance/math_brute_force/mad_float.cpp
+++ b/test_conformance/math_brute_force/mad_float.cpp
@@ -284,109 +284,11 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        cl_uint *p = (cl_uint *)gIn;
-        cl_uint *p2 = (cl_uint *)gIn2;
-        cl_uint *p3 = (cl_uint *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            p[j] = genrand_int32(d);
-            p2[j] = genrand_int32(d);
-            p3[j] = genrand_int32(d);
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
-                                        &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
 
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
         vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
              maxErrorVal3);
+    }
+
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 0e1b40a443..c2a376f166 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -62,8 +62,6 @@ static int32_t gEndTestNumber = -1;
 int gSkipCorrectnessTesting = 0;
 int gStopOnError = 0;
 static bool gSkipRestOfTests;
-int gMeasureTimes = 0;
-int gReportAverageTimes = 0;
 int gForceFTZ = 0;
 int gWimpyMode = 0;
 int gHasDouble = 0;
@@ -87,7 +85,6 @@ int gCheckTininessBeforeRounding = 1;
 int gIsInRTZMode = 0;
 uint32_t gMaxVectorSizeIndex = VECTOR_SIZE_COUNT;
 uint32_t gMinVectorSizeIndex = 0;
-const char *method[] = { "Best", "Average" };
 void *gIn = NULL;
 void *gIn2 = NULL;
 void *gIn3 = NULL;
@@ -813,24 +810,8 @@ int main(int argc, const char *argv[])
     else if (gStopOnError)
         vlog("Stopping at first error.\n");
 
-    if (gMeasureTimes)
-    {
-        vlog("%s times are reported at right (cycles per element):\n",
-             method[gReportAverageTimes]);
-        vlog("\n");
-        if (gSkipCorrectnessTesting)
-            vlog("   \t               ");
-        else
-            vlog("   \t                                        ");
-        if (gWimpyMode) vlog("   ");
-        for (int i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-            vlog("\t  float%s", sizeNames[i]);
-    }
-    else
-    {
-        vlog("   \t                                        ");
-        if (gWimpyMode) vlog("   ");
-    }
+    vlog("   \t                                        ");
+    if (gWimpyMode) vlog("   ");
     if (!gSkipCorrectnessTesting) vlog("\t  max_ulps");
 
     vlog("\n-------------------------------------------------------------------"
@@ -905,8 +886,6 @@ static int ParseArgs(int argc, const char **argv)
                 optionFound = 1;
                 switch (*arg)
                 {
-                    case 'a': gReportAverageTimes ^= 1; break;
-
                     case 'c': gToggleCorrectlyRoundedDivideSqrt ^= 1; break;
 
                     case 'd': gHasDouble ^= 1; break;
@@ -927,8 +906,6 @@ static int ParseArgs(int argc, const char **argv)
 
                     case 's': gStopOnError ^= 1; break;
 
-                    case 't': gMeasureTimes ^= 1; break;
-
                     case 'v': gVerboseBruteForce ^= 1; break;
 
                     case 'w': // wimpy mode
@@ -970,7 +947,6 @@ static int ParseArgs(int argc, const char **argv)
                         gMinVectorSizeIndex = 4;
                         gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
                         break;
-                        break;
 
                     default:
                         vlog(" <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg);
@@ -1053,9 +1029,8 @@ static void PrintFunctions(void)
 
 static void PrintUsage(void)
 {
-    vlog("%s [-acglstz]: <optional: math function names>\n", appName);
+    vlog("%s [-cglsz]: <optional: math function names>\n", appName);
     vlog("\toptions:\n");
-    vlog("\t\t-a\tReport average times instead of best times\n");
     vlog("\t\t-c\tToggle test fp correctly rounded divide and sqrt (Default: "
          "off)\n");
     vlog("\t\t-d\tToggle double precision testing. (Default: on iff khr_fp_64 "
@@ -1070,7 +1045,6 @@ static void PrintUsage(void)
          "accuracy checks.)\n");
     vlog("\t\t-m\tToggle run multi-threaded. (Default: on) )\n");
     vlog("\t\t-s\tStop on error\n");
-    vlog("\t\t-t\tToggle timing  (on by default)\n");
     vlog("\t\t-w\tToggle Wimpy Mode, * Not a valid test * \n");
     vlog("\t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is "
          "1-10, default factor(%u)\n",
@@ -1809,51 +1783,6 @@ float Abs_Error(float test, double reference)
     return fabs((float)(reference - (double)test));
 }
 
-#if defined(__APPLE__)
-#include <mach/mach_time.h>
-#endif
-
-uint64_t GetTime(void)
-{
-#if defined(__APPLE__)
-    return mach_absolute_time();
-#elif defined(_WIN32) && defined(_MSC_VER)
-    return ReadTime();
-#else
-// mach_absolute_time is a high precision timer with precision < 1 microsecond.
-#warning need accurate clock here.  Times are invalid.
-    return 0;
-#endif
-}
-
-
-#if defined(_WIN32) && defined(_MSC_VER)
-/* function is defined in "compat.h" */
-#else
-double SubtractTime(uint64_t endTime, uint64_t startTime)
-{
-    uint64_t diff = endTime - startTime;
-    static double conversion = 0.0;
-
-    if (0.0 == conversion)
-    {
-#if defined(__APPLE__)
-        mach_timebase_info_data_t info = { 0, 0 };
-        kern_return_t err = mach_timebase_info(&info);
-        if (0 == err)
-            conversion = 1e-9 * (double)info.numer / (double)info.denom;
-#else
-// This function consumes output from GetTime() above, and converts the time to
-// secionds.
-#warning need accurate ticks to seconds conversion factor here. Times are invalid.
-#endif
-    }
-
-    // strictly speaking we should also be subtracting out timer latency here
-    return conversion * (double)diff;
-}
-#endif
-
 cl_uint RoundUpToNextPowerOfTwo(cl_uint x)
 {
     if (0 == (x & (x - 1))) return x;
diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp
index 96a0e7bbb8..6e0327a283 100644
--- a/test_conformance/math_brute_force/ternary_double.cpp
+++ b/test_conformance/math_brute_force/ternary_double.cpp
@@ -723,109 +723,11 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        double *p2 = (double *)gIn2;
-        double *p3 = (double *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-            p3[j] = DoubleFromUInt32(genrand_int32(d));
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
-                                        &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
 
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
         vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
              maxErrorVal3);
+    }
+
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/ternary_float.cpp b/test_conformance/math_brute_force/ternary_float.cpp
index 20b3eb7127..9ef59887b9 100644
--- a/test_conformance/math_brute_force/ternary_float.cpp
+++ b/test_conformance/math_brute_force/ternary_float.cpp
@@ -858,109 +858,11 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        cl_uint *p = (cl_uint *)gIn;
-        cl_uint *p2 = (cl_uint *)gIn2;
-        cl_uint *p3 = (cl_uint *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            p[j] = genrand_int32(d);
-            p2[j] = genrand_int32(d);
-            p3[j] = genrand_int32(d);
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
-                                        &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
 
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
         vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
              maxErrorVal3);
+    }
+
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
index 615d0fb950..3dc1548ca8 100644
--- a/test_conformance/math_brute_force/unary_double.cpp
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -290,85 +290,10 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-
-        if (strstr(f->name, "exp"))
-            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
-                p[j] = (double)genrand_real1(d);
-        else if (strstr(f->name, "log"))
-            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
-                p[j] = fabs(DoubleFromUInt32(genrand_int32(d)));
-        else
-            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
-                p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
 
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
+        vlog("\t%8.2f @ %a", maxError, maxErrorVal);
     }
 
-    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/unary_float.cpp b/test_conformance/math_brute_force/unary_float.cpp
index 3666589230..73365d19d9 100644
--- a/test_conformance/math_brute_force/unary_float.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -315,85 +315,10 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             vlog(" (rlx skip correctness testing)\n");
             goto exit;
         }
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        if (strstr(f->name, "exp") || strstr(f->name, "sin")
-            || strstr(f->name, "cos") || strstr(f->name, "tan"))
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
-                ((float *)p)[j] = (float)genrand_real1(d);
-        else if (strstr(f->name, "log"))
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
-                p[j] = genrand_int32(d) & 0x7fffffff;
-        else
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
-                p[j] = genrand_int32(d);
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
 
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
+        vlog("\t%8.2f @ %a", maxError, maxErrorVal);
     }
 
-    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/unary_two_results_double.cpp b/test_conformance/math_brute_force/unary_two_results_double.cpp
index 77e66ca529..9402e24c1d 100644
--- a/test_conformance/math_brute_force/unary_two_results_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_double.cpp
@@ -429,84 +429,11 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
 
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
         vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
              maxErrorVal1);
+    }
+
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/unary_two_results_float.cpp b/test_conformance/math_brute_force/unary_two_results_float.cpp
index 70b24c7a94..d3fbf20b9a 100644
--- a/test_conformance/math_brute_force/unary_two_results_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_float.cpp
@@ -564,85 +564,11 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-            p[j] = genrand_int32(d);
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
 
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
         vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
              maxErrorVal1);
+    }
+
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/unary_two_results_i_double.cpp b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
index 150e5d8f7b..66bf414e6f 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
@@ -401,84 +401,10 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
 
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
     }
 
-    if (!gSkipCorrectnessTesting)
-        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/unary_two_results_i_float.cpp b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
index 1312be01bc..f6647b2239 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
@@ -400,83 +400,10 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-            p[j] = genrand_int32(d);
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
 
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
     }
 
-    if (!gSkipCorrectnessTesting)
-        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/unary_u_double.cpp b/test_conformance/math_brute_force/unary_u_double.cpp
index d104fc88cf..c7034577f8 100644
--- a/test_conformance/math_brute_force/unary_u_double.cpp
+++ b/test_conformance/math_brute_force/unary_u_double.cpp
@@ -299,76 +299,10 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-
-        for (j = 0; j < bufferSize / sizeof(double); j++) p[j] = random64(d);
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
 
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
+        vlog("\t%8.2f @ %a", maxError, maxErrorVal);
     }
 
-    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/unary_u_float.cpp b/test_conformance/math_brute_force/unary_u_float.cpp
index 4b453c871c..86389e4a3e 100644
--- a/test_conformance/math_brute_force/unary_u_float.cpp
+++ b/test_conformance/math_brute_force/unary_u_float.cpp
@@ -332,84 +332,10 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
             vlog("Wimp pass");
         else
             vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        if (strstr(f->name, "exp") || strstr(f->name, "sin")
-            || strstr(f->name, "cos") || strstr(f->name, "tan"))
-            for (j = 0; j < bufferSize / sizeof(float); j++)
-                ((float *)p)[j] = (float)genrand_real1(d);
-        else if (strstr(f->name, "log"))
-            for (j = 0; j < bufferSize / sizeof(float); j++)
-                p[j] = genrand_int32(d) & 0x7fffffff;
-        else
-            for (j = 0; j < bufferSize / sizeof(float); j++)
-                p[j] = genrand_int32(d);
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
 
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
+        vlog("\t%8.2f @ %a", maxError, maxErrorVal);
     }
 
-    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
     vlog("\n");
 
 exit:
diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h
index 894a067593..9ab915c7b1 100644
--- a/test_conformance/math_brute_force/utility.h
+++ b/test_conformance/math_brute_force/utility.h
@@ -59,8 +59,6 @@ extern cl_mem gOutBuffer2[VECTOR_SIZE_COUNT];
 extern uint32_t gComputeDevices;
 extern uint32_t gSimdSize;
 extern int gSkipCorrectnessTesting;
-extern int gMeasureTimes;
-extern int gReportAverageTimes;
 extern int gForceFTZ;
 extern int gFastRelaxedDerived;
 extern int gWimpyMode;
@@ -91,8 +89,6 @@ float Abs_Error(float test, double reference);
 float Ulp_Error(float test, double reference);
 float Bruteforce_Ulp_Error_Double(double test, long double reference);
 
-uint64_t GetTime(void);
-double SubtractTime(uint64_t endTime, uint64_t startTime);
 int MakeKernel(const char **c, cl_uint count, const char *name, cl_kernel *k,
                cl_program *p, bool relaxedMode);
 int MakeKernels(const char **c, cl_uint count, const char *name,
@@ -123,8 +119,6 @@ static inline double DoubleFromUInt32(uint32_t bits)
 void _LogBuildError(cl_program p, int line, const char *file);
 #define LogBuildError(program) _LogBuildError(program, __LINE__, __FILE__)
 
-#define PERF_LOOP_COUNT 100
-
 // The spec is fairly clear that we may enforce a hard cutoff to prevent
 // premature flushing to zero.
 // However, to avoid conflict for 1.0, we are letting results at TYPE_MIN +

From 7286e06a948495a22833551fcd70776fd53b9845 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Thu, 15 Apr 2021 18:19:09 +0100
Subject: [PATCH 073/158] Make variables and functions local to translation
 unit (#1216)

* Make variables and functions local to translation unit

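Give internal linkage (`static`) to globals and functions that are only used
in main.cpp, move some globals into the function that uses them, and remove
those that are actually dead.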

Signed-off-by: Marco Antognini <marco.antognini@arm.com>

* Address comments

Remove unused code.
Reduce scope of gDoubleCapabilities.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/main.cpp  | 59 +++++++++------------
 test_conformance/math_brute_force/utility.h |  5 --
 2 files changed, 26 insertions(+), 38 deletions(-)

diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index c2a376f166..4211ac9d80 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -51,21 +51,21 @@
     (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO                  \
      | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM)
 
-const char **gTestNames = NULL;
-unsigned int gTestNameCount = 0;
-char appName[MAXPATHLEN] = "";
+static const char **gTestNames = NULL;
+static unsigned int gTestNameCount = 0;
+static char appName[MAXPATHLEN] = "";
 cl_device_id gDevice = NULL;
 cl_context gContext = NULL;
 cl_command_queue gQueue = NULL;
 static int32_t gStartTestNumber = -1;
 static int32_t gEndTestNumber = -1;
 int gSkipCorrectnessTesting = 0;
-int gStopOnError = 0;
+static int gStopOnError = 0;
 static bool gSkipRestOfTests;
 int gForceFTZ = 0;
 int gWimpyMode = 0;
-int gHasDouble = 0;
-int gTestFloat = 1;
+static int gHasDouble = 0;
+static int gTestFloat = 1;
 // This flag should be 'ON' by default and it can be changed through the command
 // line arguments.
 static int gTestFastRelaxed = 1;
@@ -78,7 +78,7 @@ static int gTestFastRelaxed = 1;
   OpenCL 2.0 spec then it has to be changed through a command line argument.
 */
 int gFastRelaxedDerived = 1;
-int gToggleCorrectlyRoundedDivideSqrt = 0;
+static int gToggleCorrectlyRoundedDivideSqrt = 0;
 int gDeviceILogb0 = 1;
 int gDeviceILogbNaN = 1;
 int gCheckTininessBeforeRounding = 1;
@@ -97,12 +97,8 @@ cl_mem gInBuffer2 = NULL;
 cl_mem gInBuffer3 = NULL;
 cl_mem gOutBuffer[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL };
 cl_mem gOutBuffer2[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL };
-uint32_t gComputeDevices = 0;
-uint32_t gSimdSize = 1;
-uint32_t gDeviceFrequency = 0;
 static MTdata gMTdata;
 cl_device_fp_config gFloatCapabilities = 0;
-cl_device_fp_config gDoubleCapabilities = 0;
 int gWimpyReductionFactor = 32;
 int gWimpyBufferSize = BUFFER_SIZE;
 int gVerboseBruteForce = 0;
@@ -110,15 +106,14 @@ int gVerboseBruteForce = 0;
 static int ParseArgs(int argc, const char **argv);
 static void PrintUsage(void);
 static void PrintFunctions(void);
-test_status InitCL(cl_device_id device);
+static test_status InitCL(cl_device_id device);
 static void ReleaseCL(void);
 static int InitILogbConstants(void);
 static int IsTininessDetectedBeforeRounding(void);
 static int
 IsInRTZMode(void); // expensive. Please check gIsInRTZMode global instead.
 
-
-int doTest(const char *name)
+static int doTest(const char *name)
 {
     if (gSkipRestOfTests)
     {
@@ -747,7 +742,7 @@ int test_not(cl_device_id deviceID, cl_context context, cl_command_queue queue,
     return doTest("not");
 }
 
-test_definition test_list[] = {
+static test_definition test_list[] = {
     ADD_TEST(acos),          ADD_TEST(acosh),      ADD_TEST(acospi),
     ADD_TEST(asin),          ADD_TEST(asinh),      ADD_TEST(asinpi),
     ADD_TEST(atan),          ADD_TEST(atanh),      ADD_TEST(atanpi),
@@ -784,7 +779,7 @@ test_definition test_list[] = {
     ADD_TEST(not),
 };
 
-const int test_num = ARRAY_SIZE(test_list);
+static const int test_num = ARRAY_SIZE(test_list);
 
 #pragma mark -
 
@@ -1080,7 +1075,6 @@ test_status InitCL(cl_device_id device)
 {
     int error;
     uint32_t i;
-    size_t configSize = sizeof(gComputeDevices);
     cl_device_type device_type;
 
     error = clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(device_type),
@@ -1092,18 +1086,16 @@ test_status InitCL(cl_device_id device)
     }
 
     gDevice = device;
-    if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_MAX_COMPUTE_UNITS,
-                                 configSize, &gComputeDevices, NULL)))
-        gComputeDevices = 1;
 
     // Check extensions
     if (is_extension_available(gDevice, "cl_khr_fp64"))
     {
         gHasDouble ^= 1;
 #if defined(CL_DEVICE_DOUBLE_FP_CONFIG)
+        cl_device_fp_config doubleCapabilities = 0;
         if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_DOUBLE_FP_CONFIG,
-                                     sizeof(gDoubleCapabilities),
-                                     &gDoubleCapabilities, NULL)))
+                                     sizeof(doubleCapabilities),
+                                     &doubleCapabilities, NULL)))
         {
             vlog_error("ERROR: Unable to get device "
                        "CL_DEVICE_DOUBLE_FP_CONFIG. (%d)\n",
@@ -1112,19 +1104,19 @@ test_status InitCL(cl_device_id device)
         }
 
         if (DOUBLE_REQUIRED_FEATURES
-            != (gDoubleCapabilities & DOUBLE_REQUIRED_FEATURES))
+            != (doubleCapabilities & DOUBLE_REQUIRED_FEATURES))
         {
             std::string list;
-            if (0 == (gDoubleCapabilities & CL_FP_FMA)) list += "CL_FP_FMA, ";
-            if (0 == (gDoubleCapabilities & CL_FP_ROUND_TO_NEAREST))
+            if (0 == (doubleCapabilities & CL_FP_FMA)) list += "CL_FP_FMA, ";
+            if (0 == (doubleCapabilities & CL_FP_ROUND_TO_NEAREST))
                 list += "CL_FP_ROUND_TO_NEAREST, ";
-            if (0 == (gDoubleCapabilities & CL_FP_ROUND_TO_ZERO))
+            if (0 == (doubleCapabilities & CL_FP_ROUND_TO_ZERO))
                 list += "CL_FP_ROUND_TO_ZERO, ";
-            if (0 == (gDoubleCapabilities & CL_FP_ROUND_TO_INF))
+            if (0 == (doubleCapabilities & CL_FP_ROUND_TO_INF))
                 list += "CL_FP_ROUND_TO_INF, ";
-            if (0 == (gDoubleCapabilities & CL_FP_INF_NAN))
+            if (0 == (doubleCapabilities & CL_FP_INF_NAN))
                 list += "CL_FP_INF_NAN, ";
-            if (0 == (gDoubleCapabilities & CL_FP_DENORM))
+            if (0 == (doubleCapabilities & CL_FP_DENORM))
                 list += "CL_FP_DENORM, ";
             vlog_error("ERROR: required double features are missing: %s\n",
                        list.c_str());
@@ -1138,10 +1130,11 @@ test_status InitCL(cl_device_id device)
 #endif
     }
 
-    configSize = sizeof(gDeviceFrequency);
+    uint32_t deviceFrequency = 0;
+    size_t configSize = sizeof(deviceFrequency);
     if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY,
-                                 configSize, &gDeviceFrequency, NULL)))
-        gDeviceFrequency = 0;
+                                 configSize, &deviceFrequency, NULL)))
+        deviceFrequency = 0;
 
     if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_SINGLE_FP_CONFIG,
                                  sizeof(gFloatCapabilities),
@@ -1288,7 +1281,7 @@ test_status InitCL(cl_device_id device)
     vlog("\tCL C Version: %s\n", c);
     clGetDeviceInfo(gDevice, CL_DRIVER_VERSION, sizeof(c), &c, NULL);
     vlog("\tDriver Version: %s\n", c);
-    vlog("\tDevice Frequency: %d MHz\n", gDeviceFrequency);
+    vlog("\tDevice Frequency: %d MHz\n", deviceFrequency);
     vlog("\tSubnormal values supported for floats? %s\n",
          no_yes[0 != (CL_FP_DENORM & gFloatCapabilities)]);
     vlog("\tCorrectly rounded divide and sqrt supported for floats? %s\n",
diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h
index 9ab915c7b1..0e11457c17 100644
--- a/test_conformance/math_brute_force/utility.h
+++ b/test_conformance/math_brute_force/utility.h
@@ -56,22 +56,17 @@ extern cl_mem gInBuffer2;
 extern cl_mem gInBuffer3;
 extern cl_mem gOutBuffer[VECTOR_SIZE_COUNT];
 extern cl_mem gOutBuffer2[VECTOR_SIZE_COUNT];
-extern uint32_t gComputeDevices;
-extern uint32_t gSimdSize;
 extern int gSkipCorrectnessTesting;
 extern int gForceFTZ;
 extern int gFastRelaxedDerived;
 extern int gWimpyMode;
-extern int gHasDouble;
 extern int gIsInRTZMode;
 extern int gInfNanSupport;
 extern int gIsEmbedded;
 extern int gVerboseBruteForce;
 extern uint32_t gMaxVectorSizeIndex;
 extern uint32_t gMinVectorSizeIndex;
-extern uint32_t gDeviceFrequency;
 extern cl_device_fp_config gFloatCapabilities;
-extern cl_device_fp_config gDoubleCapabilities;
 
 #define LOWER_IS_BETTER 0
 #define HIGHER_IS_BETTER 1

From 6683fa91a95d055b51a2c3236582bfc48adfbd96 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Tue, 20 Apr 2021 17:10:06 +0100
Subject: [PATCH 074/158] Use lambda to reduce amount of code (#1219)

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/main.cpp | 515 +--------------------
 1 file changed, 15 insertions(+), 500 deletions(-)

diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 4211ac9d80..1cfc3c4359 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -241,506 +241,18 @@ static int doTest(const char *name)
     return error;
 }
 
-int test_acos(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("acos");
-}
-int test_acosh(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("acosh");
-}
-int test_acospi(cl_device_id deviceID, cl_context context,
-                cl_command_queue queue, int num_elements)
-{
-    return doTest("acospi");
-}
-int test_asin(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("asin");
-}
-int test_asinh(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("asinh");
-}
-int test_asinpi(cl_device_id deviceID, cl_context context,
-                cl_command_queue queue, int num_elements)
-{
-    return doTest("asinpi");
-}
-int test_atan(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("atan");
-}
-int test_atanh(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("atanh");
-}
-int test_atanpi(cl_device_id deviceID, cl_context context,
-                cl_command_queue queue, int num_elements)
-{
-    return doTest("atanpi");
-}
-int test_atan2(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("atan2");
-}
-int test_atan2pi(cl_device_id deviceID, cl_context context,
-                 cl_command_queue queue, int num_elements)
-{
-    return doTest("atan2pi");
-}
-int test_cbrt(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("cbrt");
-}
-int test_ceil(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("ceil");
-}
-int test_copysign(cl_device_id deviceID, cl_context context,
-                  cl_command_queue queue, int num_elements)
-{
-    return doTest("copysign");
-}
-int test_cos(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-             int num_elements)
-{
-    return doTest("cos");
-}
-int test_cosh(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("cosh");
-}
-int test_cospi(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("cospi");
-}
-int test_exp(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-             int num_elements)
-{
-    return doTest("exp");
-}
-int test_exp2(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("exp2");
-}
-int test_exp10(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("exp10");
-}
-int test_expm1(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("expm1");
-}
-int test_fabs(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("fabs");
-}
-int test_fdim(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("fdim");
-}
-int test_floor(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("floor");
-}
-int test_fma(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-             int num_elements)
-{
-    return doTest("fma");
-}
-int test_fmax(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("fmax");
-}
-int test_fmin(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("fmin");
-}
-int test_fmod(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("fmod");
-}
-int test_fract(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("fract");
-}
-int test_frexp(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("frexp");
-}
-int test_hypot(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("hypot");
-}
-int test_ilogb(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("ilogb");
-}
-int test_isequal(cl_device_id deviceID, cl_context context,
-                 cl_command_queue queue, int num_elements)
-{
-    return doTest("isequal");
-}
-int test_isfinite(cl_device_id deviceID, cl_context context,
-                  cl_command_queue queue, int num_elements)
-{
-    return doTest("isfinite");
-}
-int test_isgreater(cl_device_id deviceID, cl_context context,
-                   cl_command_queue queue, int num_elements)
-{
-    return doTest("isgreater");
-}
-int test_isgreaterequal(cl_device_id deviceID, cl_context context,
-                        cl_command_queue queue, int num_elements)
-{
-    return doTest("isgreaterequal");
-}
-int test_isinf(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("isinf");
-}
-int test_isless(cl_device_id deviceID, cl_context context,
-                cl_command_queue queue, int num_elements)
-{
-    return doTest("isless");
-}
-int test_islessequal(cl_device_id deviceID, cl_context context,
-                     cl_command_queue queue, int num_elements)
-{
-    return doTest("islessequal");
-}
-int test_islessgreater(cl_device_id deviceID, cl_context context,
-                       cl_command_queue queue, int num_elements)
-{
-    return doTest("islessgreater");
-}
-int test_isnan(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("isnan");
-}
-int test_isnormal(cl_device_id deviceID, cl_context context,
-                  cl_command_queue queue, int num_elements)
-{
-    return doTest("isnormal");
-}
-int test_isnotequal(cl_device_id deviceID, cl_context context,
-                    cl_command_queue queue, int num_elements)
-{
-    return doTest("isnotequal");
-}
-int test_isordered(cl_device_id deviceID, cl_context context,
-                   cl_command_queue queue, int num_elements)
-{
-    return doTest("isordered");
-}
-int test_isunordered(cl_device_id deviceID, cl_context context,
-                     cl_command_queue queue, int num_elements)
-{
-    return doTest("isunordered");
-}
-int test_ldexp(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("ldexp");
-}
-int test_lgamma(cl_device_id deviceID, cl_context context,
-                cl_command_queue queue, int num_elements)
-{
-    return doTest("lgamma");
-}
-int test_lgamma_r(cl_device_id deviceID, cl_context context,
-                  cl_command_queue queue, int num_elements)
-{
-    return doTest("lgamma_r");
-}
-int test_log(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-             int num_elements)
-{
-    return doTest("log");
-}
-int test_log2(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("log2");
-}
-int test_log10(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("log10");
-}
-int test_log1p(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("log1p");
-}
-int test_logb(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("logb");
-}
-int test_mad(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-             int num_elements)
-{
-    return doTest("mad");
-}
-int test_maxmag(cl_device_id deviceID, cl_context context,
-                cl_command_queue queue, int num_elements)
-{
-    return doTest("maxmag");
-}
-int test_minmag(cl_device_id deviceID, cl_context context,
-                cl_command_queue queue, int num_elements)
-{
-    return doTest("minmag");
-}
-int test_modf(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("modf");
-}
-int test_nan(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-             int num_elements)
-{
-    return doTest("nan");
-}
-int test_nextafter(cl_device_id deviceID, cl_context context,
-                   cl_command_queue queue, int num_elements)
-{
-    return doTest("nextafter");
-}
-int test_pow(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-             int num_elements)
-{
-    return doTest("pow");
-}
-int test_pown(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("pown");
-}
-int test_powr(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("powr");
-}
-int test_remainder(cl_device_id deviceID, cl_context context,
-                   cl_command_queue queue, int num_elements)
-{
-    return doTest("remainder");
-}
-int test_remquo(cl_device_id deviceID, cl_context context,
-                cl_command_queue queue, int num_elements)
-{
-    return doTest("remquo");
-}
-int test_rint(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("rint");
-}
-int test_rootn(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("rootn");
-}
-int test_round(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("round");
-}
-int test_rsqrt(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("rsqrt");
-}
-int test_signbit(cl_device_id deviceID, cl_context context,
-                 cl_command_queue queue, int num_elements)
-{
-    return doTest("signbit");
-}
-int test_sin(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-             int num_elements)
-{
-    return doTest("sin");
-}
-int test_sincos(cl_device_id deviceID, cl_context context,
-                cl_command_queue queue, int num_elements)
-{
-    return doTest("sincos");
-}
-int test_sinh(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("sinh");
-}
-int test_sinpi(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("sinpi");
-}
-int test_sqrt(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("sqrt");
-}
-int test_sqrt_cr(cl_device_id deviceID, cl_context context,
-                 cl_command_queue queue, int num_elements)
-{
-    return doTest("sqrt_cr");
-}
-int test_tan(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-             int num_elements)
-{
-    return doTest("tan");
-}
-int test_tanh(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-              int num_elements)
-{
-    return doTest("tanh");
-}
-int test_tanpi(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("tanpi");
-}
-int test_trunc(cl_device_id deviceID, cl_context context,
-               cl_command_queue queue, int num_elements)
-{
-    return doTest("trunc");
-}
-int test_half_cos(cl_device_id deviceID, cl_context context,
-                  cl_command_queue queue, int num_elements)
-{
-    return doTest("half_cos");
-}
-int test_half_divide(cl_device_id deviceID, cl_context context,
-                     cl_command_queue queue, int num_elements)
-{
-    return doTest("half_divide");
-}
-int test_half_exp(cl_device_id deviceID, cl_context context,
-                  cl_command_queue queue, int num_elements)
-{
-    return doTest("half_exp");
-}
-int test_half_exp2(cl_device_id deviceID, cl_context context,
-                   cl_command_queue queue, int num_elements)
-{
-    return doTest("half_exp2");
-}
-int test_half_exp10(cl_device_id deviceID, cl_context context,
-                    cl_command_queue queue, int num_elements)
-{
-    return doTest("half_exp10");
-}
-int test_half_log(cl_device_id deviceID, cl_context context,
-                  cl_command_queue queue, int num_elements)
-{
-    return doTest("half_log");
-}
-int test_half_log2(cl_device_id deviceID, cl_context context,
-                   cl_command_queue queue, int num_elements)
-{
-    return doTest("half_log2");
-}
-int test_half_log10(cl_device_id deviceID, cl_context context,
-                    cl_command_queue queue, int num_elements)
-{
-    return doTest("half_log10");
-}
-int test_half_powr(cl_device_id deviceID, cl_context context,
-                   cl_command_queue queue, int num_elements)
-{
-    return doTest("half_powr");
-}
-int test_half_recip(cl_device_id deviceID, cl_context context,
-                    cl_command_queue queue, int num_elements)
-{
-    return doTest("half_recip");
-}
-int test_half_rsqrt(cl_device_id deviceID, cl_context context,
-                    cl_command_queue queue, int num_elements)
-{
-    return doTest("half_rsqrt");
-}
-int test_half_sin(cl_device_id deviceID, cl_context context,
-                  cl_command_queue queue, int num_elements)
-{
-    return doTest("half_sin");
-}
-int test_half_sqrt(cl_device_id deviceID, cl_context context,
-                   cl_command_queue queue, int num_elements)
-{
-    return doTest("half_sqrt");
-}
-int test_half_tan(cl_device_id deviceID, cl_context context,
-                  cl_command_queue queue, int num_elements)
-{
-    return doTest("half_tan");
-}
-int test_add(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-             int num_elements)
-{
-    return doTest("add");
-}
-int test_subtract(cl_device_id deviceID, cl_context context,
-                  cl_command_queue queue, int num_elements)
-{
-    return doTest("subtract");
-}
-int test_divide(cl_device_id deviceID, cl_context context,
-                cl_command_queue queue, int num_elements)
-{
-    return doTest("divide");
-}
-int test_divide_cr(cl_device_id deviceID, cl_context context,
-                   cl_command_queue queue, int num_elements)
-{
-    return doTest("divide_cr");
-}
-int test_multiply(cl_device_id deviceID, cl_context context,
-                  cl_command_queue queue, int num_elements)
-{
-    return doTest("multiply");
-}
-int test_assignment(cl_device_id deviceID, cl_context context,
-                    cl_command_queue queue, int num_elements)
-{
-    return doTest("assignment");
-}
-int test_not(cl_device_id deviceID, cl_context context, cl_command_queue queue,
-             int num_elements)
-{
-    return doTest("not");
-}
+
+#define TEST_LAMBDA(name)                                                      \
+    [](cl_device_id, cl_context, cl_command_queue, int) {                      \
+        return doTest(#name);                                                  \
+    }
+
+// Redefine ADD_TEST to use TEST_LAMBDA.
+#undef ADD_TEST
+#define ADD_TEST(name)                                                         \
+    {                                                                          \
+        TEST_LAMBDA(name), #name, Version(1, 0)                                \
+    }
 
 static test_definition test_list[] = {
     ADD_TEST(acos),          ADD_TEST(acosh),      ADD_TEST(acospi),
@@ -779,6 +291,9 @@ static test_definition test_list[] = {
     ADD_TEST(not),
 };
 
+#undef ADD_TEST
+#undef TEST_LAMBDA
+
 static const int test_num = ARRAY_SIZE(test_list);
 
 #pragma mark -

From 7542ae2443faaba69eab092c6311e9b11349a29c Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Tue, 20 Apr 2021 17:10:29 +0100
Subject: [PATCH 075/158] Fold code into loop (#1218)

Use one loop to read buffers from device, with the last read operation
blocking until complete.

This pattern cannot be elegantly refactored just yet, mainly for two
reasons:
 - Some tests use goto statements to clean up their resources.
 - Some tests (not modified in this patch) only use blocking read
   operations.

Once code is further unified, this pattern can be refactored into a
helper function.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/binary_double.cpp        | 19 +++++-------------
 .../math_brute_force/binary_float.cpp         | 19 +++++-------------
 .../math_brute_force/binary_i_double.cpp      | 19 +++++-------------
 .../math_brute_force/binary_i_float.cpp       | 19 +++++-------------
 .../binary_operator_double.cpp                | 19 +++++-------------
 .../binary_operator_float.cpp                 | 19 +++++-------------
 .../math_brute_force/macro_binary_double.cpp  | 20 +++++--------------
 .../math_brute_force/macro_binary_float.cpp   | 20 +++++--------------
 .../math_brute_force/macro_unary_double.cpp   | 19 +++++-------------
 .../math_brute_force/macro_unary_float.cpp    | 19 +++++-------------
 .../math_brute_force/unary_double.cpp         | 19 +++++-------------
 .../math_brute_force/unary_float.cpp          | 19 +++++-------------
 12 files changed, 60 insertions(+), 170 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
index e51327dfcd..cc46ebd375 100644
--- a/test_conformance/math_brute_force/binary_double.cpp
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -636,12 +636,13 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     for (j = 0; j < buffer_elements; j++)
         r[j] = (cl_double)func.f_ff(s[j], s2[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
         {
@@ -651,16 +652,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    // Wait for the last buffer
-    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
     // Verify data
     t = (cl_ulong *)r;
     for (j = 0; j < buffer_elements; j++)
diff --git a/test_conformance/math_brute_force/binary_float.cpp b/test_conformance/math_brute_force/binary_float.cpp
index 5a37a40771..6bea0c9a8d 100644
--- a/test_conformance/math_brute_force/binary_float.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -684,12 +684,13 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     if (isFDim && ftz) RestoreFPState(&oldMode);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_uint *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
         {
@@ -699,16 +700,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    // Wait for the last buffer
-    out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
-                                           0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
     if (!skipVerification)
     {
         // Verify data
diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index c3c3eb1366..7e19b24392 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -635,12 +635,13 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     for (j = 0; j < buffer_elements; j++)
         r[j] = (cl_double)func.f_fi(s[j], s2[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
         {
@@ -650,16 +651,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    // Wait for the last buffer
-    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
     // Verify data
     t = (cl_ulong *)r;
     for (j = 0; j < buffer_elements; j++)
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
index 93cb910b34..08790c9d2c 100644
--- a/test_conformance/math_brute_force/binary_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -627,12 +627,13 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
     for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_fi(s[j], s2[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_uint *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
         {
@@ -642,16 +643,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    // Wait for the last buffer
-    out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
-                                           0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
     // Verify data
     t = (cl_uint *)r;
     for (j = 0; j < buffer_elements; j++)
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index 09540b0374..dc1edf96d3 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -629,12 +629,13 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     for (j = 0; j < buffer_elements; j++)
         r[j] = (cl_double)func.f_ff(s[j], s2[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
         {
@@ -644,16 +645,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    // Wait for the last buffer
-    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
     // Verify data
     t = (cl_ulong *)r;
     for (j = 0; j < buffer_elements; j++)
diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
index a59be16316..bc0376546f 100644
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -674,12 +674,13 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     if (ftz) RestoreFPState(&oldMode);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_uint *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
         {
@@ -689,16 +690,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    // Wait for the last buffer
-    out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
-                                           0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
     // Verify data
     t = (cl_uint *)r;
     for (j = 0; j < buffer_elements; j++)
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index 7c40a18b75..4085bbd5bc 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -596,13 +596,13 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
     for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_ff(s[j], s2[j]);
 
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_long *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
         {
@@ -612,16 +612,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    // Wait for the last buffer
-    out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
-                                           0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
     // Verify data
     t = (cl_long *)r;
     for (j = 0; j < buffer_elements; j++)
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
index 12c14dc5a6..7c51895adf 100644
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -589,13 +589,13 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     s2 = (float *)gIn2 + thread_id * buffer_elements;
     for (j = 0; j < buffer_elements; j++) r[j] = func.i_ff(s[j], s2[j]);
 
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_int *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
         {
@@ -605,16 +605,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    // Wait for the last buffer
-    out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                          CL_TRUE, CL_MAP_READ, 0, buffer_size,
-                                          0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
     // Verify data
     t = (cl_int *)r;
     for (j = 0; j < buffer_elements; j++)
diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp
index 4fadbfc35b..d2501e228c 100644
--- a/test_conformance/math_brute_force/macro_unary_double.cpp
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -413,12 +413,13 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     cl_double *s = (cl_double *)p;
     for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_long *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
         {
@@ -428,16 +429,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    // Wait for the last buffer
-    out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
-                                           0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        return error;
-    }
-
     // Verify data
     cl_long *t = (cl_long *)r;
     for (j = 0; j < buffer_elements; j++)
diff --git a/test_conformance/math_brute_force/macro_unary_float.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp
index 4e72748ae7..94a315d064 100644
--- a/test_conformance/math_brute_force/macro_unary_float.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -416,12 +416,13 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     float *s = (float *)p;
     for (j = 0; j < buffer_elements; j++) r[j] = ref_func(s[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_int *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
         {
@@ -431,16 +432,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    // Wait for the last buffer
-    out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                          CL_TRUE, CL_MAP_READ, 0, buffer_size,
-                                          0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        return error;
-    }
-
     // Verify data
     cl_int *t = (cl_int *)r;
     for (j = 0; j < buffer_elements; j++)
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
index 3dc1548ca8..3b2fe4738b 100644
--- a/test_conformance/math_brute_force/unary_double.cpp
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -436,12 +436,13 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     cl_double *s = (cl_double *)p;
     for (j = 0; j < buffer_elements; j++) r[j] = (cl_double)func.f_f(s[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
         {
@@ -450,16 +451,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
             return error;
         }
     }
-    // Wait for the last buffer
-    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        return error;
-    }
-
 
     // Verify data
     cl_ulong *t = (cl_ulong *)r;
diff --git a/test_conformance/math_brute_force/unary_float.cpp b/test_conformance/math_brute_force/unary_float.cpp
index 73365d19d9..83d5e807f6 100644
--- a/test_conformance/math_brute_force/unary_float.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -491,12 +491,13 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     float *s = (float *)p;
     for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_f(s[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_uint *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
         {
@@ -506,16 +507,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    // Wait for the last buffer
-    out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        return error;
-    }
-
     // Verify data
     uint32_t *t = (uint32_t *)r;
     for (j = 0; j < buffer_elements; j++)

From b8d1ea9962a7bb443e845f5f035b254172fc4ae7 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Wed, 21 Apr 2021 10:45:34 +0100
Subject: [PATCH 076/158] Fold constant global variable (#1217)

gWimpyBufferSize is never modified and is actually not used to modify
the number of tests -- gWimpyReductionFactor is used for that purpose by
some tests, but not all.

This patch removes this unnecessary global variable to simplify the
codebase, and reduce differences between tests.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/binary_double.cpp        |  7 ----
 .../math_brute_force/binary_float.cpp         |  7 ----
 .../math_brute_force/binary_i_double.cpp      |  7 ----
 .../math_brute_force/binary_i_float.cpp       |  7 ----
 .../binary_operator_double.cpp                |  7 ----
 .../binary_operator_float.cpp                 |  7 ----
 .../binary_two_results_i_double.cpp           | 37 +++++++++----------
 .../binary_two_results_i_float.cpp            | 37 +++++++++----------
 .../math_brute_force/i_unary_double.cpp       | 30 +++++++--------
 .../math_brute_force/i_unary_float.cpp        | 27 +++++++-------
 .../math_brute_force/macro_binary_double.cpp  |  7 ----
 .../math_brute_force/macro_binary_float.cpp   |  7 ----
 .../math_brute_force/macro_unary_double.cpp   |  7 ----
 .../math_brute_force/macro_unary_float.cpp    |  7 ----
 .../math_brute_force/mad_double.cpp           | 23 ++++++------
 .../math_brute_force/mad_float.cpp            | 23 ++++++------
 test_conformance/math_brute_force/main.cpp    |  1 -
 .../math_brute_force/ternary_double.cpp       | 31 ++++++++--------
 .../math_brute_force/ternary_float.cpp        | 33 ++++++++---------
 .../math_brute_force/unary_double.cpp         |  6 ---
 .../math_brute_force/unary_float.cpp          |  7 ----
 .../unary_two_results_double.cpp              | 36 +++++++++---------
 .../unary_two_results_float.cpp               | 37 +++++++++----------
 .../unary_two_results_i_double.cpp            | 36 +++++++++---------
 .../unary_two_results_i_float.cpp             | 35 +++++++++---------
 .../math_brute_force/unary_u_double.cpp       | 21 +++++------
 .../math_brute_force/unary_u_float.cpp        | 25 ++++++-------
 test_conformance/math_brute_force/utility.h   |  1 -
 28 files changed, 210 insertions(+), 306 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
index cc46ebd375..9ecad99a37 100644
--- a/test_conformance/math_brute_force/binary_double.cpp
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -300,13 +300,6 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
 
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_double)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/binary_float.cpp b/test_conformance/math_brute_force/binary_float.cpp
index 6bea0c9a8d..4186f1c4f0 100644
--- a/test_conformance/math_brute_force/binary_float.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -291,13 +291,6 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
 
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_float)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index 7e19b24392..6a4ecfc43d 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -302,13 +302,6 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
 
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_double)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
index 08790c9d2c..1e7103c28f 100644
--- a/test_conformance/math_brute_force/binary_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -294,13 +294,6 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
 
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_float)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index dc1edf96d3..740c9d7f90 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -298,13 +298,6 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
 
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_double)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
index bc0376546f..921f7101c5 100644
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -288,13 +288,6 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
 
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_float)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/binary_two_results_i_double.cpp b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
index f548486f04..e6243754d8 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
@@ -178,8 +178,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     int ftz = f->ftz || gForceFTZ;
     double maxErrorVal = 0.0f;
     double maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(double), bufferSize);
+    uint64_t step = getTestStep(sizeof(double), BUFFER_SIZE);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -204,21 +203,21 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         // Init input array
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
@@ -228,20 +227,20 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
                 goto exit;
             }
 
-            memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
-                                          bufferSize, gOut2[j], 0, NULL, NULL)))
+            memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+            if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE,
+                                              0, BUFFER_SIZE, gOut2[j], 0, NULL,
+                                              NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
                            error, j);
@@ -253,8 +252,8 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -304,7 +303,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             cri.r = (double *)gOut_Ref;
             cri.i = (int *)gOut_Ref2;
             cri.f_ffpI = f->dfunc.f_ffpI;
-            cri.lim = bufferSize / sizeof(double);
+            cri.lim = BUFFER_SIZE / sizeof(double);
             cri.count = (cri.lim + threadCount - 1) / threadCount;
             ThreadPool_Do(ReferenceD, threadCount, &cri);
         }
@@ -312,7 +311,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         {
             double *r = (double *)gOut_Ref;
             int *r2 = (int *)gOut_Ref2;
-            for (j = 0; j < bufferSize / sizeof(double); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
                 r[j] = (double)f->dfunc.f_ffpI(s[j], s2[j], r2 + j);
         }
 
@@ -321,14 +320,14 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
@@ -340,7 +339,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
             for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
@@ -549,7 +548,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             if (gVerboseBruteForce)
             {
                 vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
+                     BUFFER_SIZE);
             }
             else
             {
diff --git a/test_conformance/math_brute_force/binary_two_results_i_float.cpp b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
index 01c3ca7df8..476ae570d6 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
@@ -177,8 +177,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     int64_t maxError2 = 0;
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
+    uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
 
     cl_uint threadCount = GetThreadCount();
 
@@ -205,21 +204,21 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         // Init input array
         cl_uint *p = (cl_uint *)gIn;
         cl_uint *p2 = (cl_uint *)gIn2;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
@@ -229,20 +228,20 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
                 goto exit;
             }
 
-            memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
-                                          bufferSize, gOut2[j], 0, NULL, NULL)))
+            memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+            if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE,
+                                              0, BUFFER_SIZE, gOut2[j], 0, NULL,
+                                              NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
                            error, j);
@@ -254,8 +253,8 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -305,7 +304,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             cri.r = (float *)gOut_Ref;
             cri.i = (int *)gOut_Ref2;
             cri.f_ffpI = f->func.f_ffpI;
-            cri.lim = bufferSize / sizeof(float);
+            cri.lim = BUFFER_SIZE / sizeof(float);
             cri.count = (cri.lim + threadCount - 1) / threadCount;
             ThreadPool_Do(ReferenceF, threadCount, &cri);
         }
@@ -313,7 +312,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         {
             float *r = (float *)gOut_Ref;
             int *r2 = (int *)gOut_Ref2;
-            for (j = 0; j < bufferSize / sizeof(float); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 r[j] = (float)f->func.f_ffpI(s[j], s2[j], r2 + j);
         }
 
@@ -322,14 +321,14 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
@@ -341,7 +340,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
@@ -534,7 +533,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             if (gVerboseBruteForce)
             {
                 vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
+                     BUFFER_SIZE);
             }
             else
             {
diff --git a/test_conformance/math_brute_force/i_unary_double.cpp b/test_conformance/math_brute_force/i_unary_double.cpp
index 040f8e65dc..451e43e8a5 100644
--- a/test_conformance/math_brute_force/i_unary_double.cpp
+++ b/test_conformance/math_brute_force/i_unary_double.cpp
@@ -125,9 +125,9 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     int ftz = f->ftz || gForceFTZ;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1);
+    uint64_t step = getTestStep(sizeof(cl_double), BUFFER_SIZE);
+    int scale =
+        (int)((1ULL << 32) / (16 * BUFFER_SIZE / sizeof(cl_double)) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -154,17 +154,17 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
         double *p = (double *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
         }
         else
         {
-            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
@@ -174,10 +174,10 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
@@ -188,9 +188,9 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
+            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -219,7 +219,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
         // Calculate the correctly rounded reference result
         int *r = (int *)gOut_Ref;
         double *s = (double *)gIn;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
             r[j] = f->dfunc.i_f(s[j]);
 
         // Read the data back
@@ -227,7 +227,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 goto exit;
@@ -238,7 +238,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
         {
             for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
@@ -270,7 +270,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             if (gVerboseBruteForce)
             {
                 vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
+                     BUFFER_SIZE);
             }
             else
             {
diff --git a/test_conformance/math_brute_force/i_unary_float.cpp b/test_conformance/math_brute_force/i_unary_float.cpp
index 61ce9a236a..8883d7a1b2 100644
--- a/test_conformance/math_brute_force/i_unary_float.cpp
+++ b/test_conformance/math_brute_force/i_unary_float.cpp
@@ -123,9 +123,8 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
+    uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
+    int scale = (int)((1ULL << 32) / (16 * BUFFER_SIZE / sizeof(float)) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
@@ -152,17 +151,17 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
         cl_uint *p = (cl_uint *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = (cl_uint)i + j * scale;
         }
         else
         {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = (uint32_t)i + j;
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
@@ -172,10 +171,10 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
@@ -187,8 +186,8 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -217,7 +216,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
         // Calculate the correctly rounded reference result
         int *r = (int *)gOut_Ref;
         float *s = (float *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             r[j] = f->func.i_f(s[j]);
 
         // Read the data back
@@ -225,7 +224,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 goto exit;
@@ -236,7 +235,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
 
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
@@ -268,7 +267,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
             if (gVerboseBruteForce)
             {
                 vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
+                     BUFFER_SIZE);
             }
             else
             {
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index 4085bbd5bc..99c71b68dc 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -284,13 +284,6 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
 
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_double)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
index 7c51895adf..44050b7c19 100644
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -274,13 +274,6 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
 
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_float)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp
index d2501e228c..49852a85fd 100644
--- a/test_conformance/math_brute_force/macro_unary_double.cpp
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -162,13 +162,6 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
 
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_double)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/macro_unary_float.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp
index 94a315d064..116f8d74c3 100644
--- a/test_conformance/math_brute_force/macro_unary_float.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -161,13 +161,6 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
 
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_float)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/mad_double.cpp b/test_conformance/math_brute_force/mad_double.cpp
index a5f591838a..aa24507329 100644
--- a/test_conformance/math_brute_force/mad_double.cpp
+++ b/test_conformance/math_brute_force/mad_double.cpp
@@ -141,8 +141,7 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
     double maxErrorVal = 0.0f;
     double maxErrorVal2 = 0.0f;
     double maxErrorVal3 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(double), bufferSize);
+    uint64_t step = getTestStep(sizeof(double), BUFFER_SIZE);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -162,7 +161,7 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
         double *p3 = (double *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
@@ -170,21 +169,21 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn3, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
@@ -194,10 +193,10 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
@@ -209,8 +208,8 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -253,7 +252,7 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
         double *s = (double *)gIn;
         double *s2 = (double *)gIn2;
         double *s3 = (double *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
             r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]);
 
         // Read the data back
@@ -261,7 +260,7 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 goto exit;
diff --git a/test_conformance/math_brute_force/mad_float.cpp b/test_conformance/math_brute_force/mad_float.cpp
index ff4e99d0dc..5d99eb0e93 100644
--- a/test_conformance/math_brute_force/mad_float.cpp
+++ b/test_conformance/math_brute_force/mad_float.cpp
@@ -142,8 +142,7 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
     float maxErrorVal3 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
+    uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
 
     // Init the kernels
     {
@@ -161,7 +160,7 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
         cl_uint *p = (cl_uint *)gIn;
         cl_uint *p2 = (cl_uint *)gIn2;
         cl_uint *p3 = (cl_uint *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
@@ -169,21 +168,21 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn3, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
@@ -193,10 +192,10 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
@@ -208,8 +207,8 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -252,7 +251,7 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
         float *s = (float *)gIn;
         float *s2 = (float *)gIn2;
         float *s3 = (float *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             r[j] = (float)f->func.f_fff(s[j], s2[j], s3[j]);
 
         // Read the data back
@@ -260,7 +259,7 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 goto exit;
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 1cfc3c4359..7b06ca87aa 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -100,7 +100,6 @@ cl_mem gOutBuffer2[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL };
 static MTdata gMTdata;
 cl_device_fp_config gFloatCapabilities = 0;
 int gWimpyReductionFactor = 32;
-int gWimpyBufferSize = BUFFER_SIZE;
 int gVerboseBruteForce = 0;
 
 static int ParseArgs(int argc, const char **argv);
diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp
index 6e0327a283..02d06fe315 100644
--- a/test_conformance/math_brute_force/ternary_double.cpp
+++ b/test_conformance/math_brute_force/ternary_double.cpp
@@ -218,8 +218,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
     double maxErrorVal = 0.0f;
     double maxErrorVal2 = 0.0f;
     double maxErrorVal3 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(double), bufferSize);
+    uint64_t step = getTestStep(sizeof(double), BUFFER_SIZE);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -246,7 +245,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
         { // test edge cases
             uint32_t x, y, z;
             x = y = z = 0;
-            for (; j < bufferSize / sizeof(double); j++)
+            for (; j < BUFFER_SIZE / sizeof(double); j++)
             {
                 p[j] = specialValues[x];
                 p2[j] = specialValues[y];
@@ -261,11 +260,11 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
                     }
                 }
             }
-            if (j == bufferSize / sizeof(double))
+            if (j == BUFFER_SIZE / sizeof(double))
                 vlog_error("Test Error: not all special cases tested!\n");
         }
 
-        for (; j < bufferSize / sizeof(double); j++)
+        for (; j < BUFFER_SIZE / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
@@ -273,21 +272,21 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn3, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
@@ -297,10 +296,10 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
@@ -312,8 +311,8 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -356,7 +355,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
         double *s = (double *)gIn;
         double *s2 = (double *)gIn2;
         double *s3 = (double *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
             r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]);
 
         // Read the data back
@@ -364,7 +363,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 goto exit;
@@ -375,7 +374,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
 
         // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
             for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
@@ -707,7 +706,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
             if (gVerboseBruteForce)
             {
                 vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
+                     BUFFER_SIZE);
             }
             else
             {
diff --git a/test_conformance/math_brute_force/ternary_float.cpp b/test_conformance/math_brute_force/ternary_float.cpp
index 9ef59887b9..5ad564aae3 100644
--- a/test_conformance/math_brute_force/ternary_float.cpp
+++ b/test_conformance/math_brute_force/ternary_float.cpp
@@ -228,8 +228,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
     float maxErrorVal3 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
+    uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
 
     cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
 
@@ -265,7 +264,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             float *fp3 = (float *)gIn3;
             uint32_t x, y, z;
             x = y = z = 0;
-            for (; j < bufferSize / sizeof(float); j++)
+            for (; j < BUFFER_SIZE / sizeof(float); j++)
             {
                 fp[j] = specialValues[x];
                 fp2[j] = specialValues[y];
@@ -281,11 +280,11 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                     }
                 }
             }
-            if (j == bufferSize / sizeof(float))
+            if (j == BUFFER_SIZE / sizeof(float))
                 vlog_error("Test Error: not all special cases tested!\n");
         }
 
-        for (; j < bufferSize / sizeof(float); j++)
+        for (; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
@@ -293,21 +292,21 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn3, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
@@ -317,10 +316,10 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
@@ -332,8 +331,8 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -378,7 +377,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         float *s3 = (float *)gIn3;
         if (skipNanInf)
         {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             {
                 feclearexcept(FE_OVERFLOW);
                 r[j] =
@@ -389,7 +388,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         }
         else
         {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 r[j] =
                     (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
         }
@@ -399,7 +398,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 goto exit;
@@ -410,7 +409,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
@@ -842,7 +841,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             if (gVerboseBruteForce)
             {
                 vlog("base:%14u step:%10u bufferSize:%10zd \n", i, step,
-                     bufferSize);
+                     BUFFER_SIZE);
             }
             else
             {
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
index 3b2fe4738b..fe3edef7a4 100644
--- a/test_conformance/math_brute_force/unary_double.cpp
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -170,12 +170,6 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_double)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
 
     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
diff --git a/test_conformance/math_brute_force/unary_float.cpp b/test_conformance/math_brute_force/unary_float.cpp
index 83d5e807f6..a0e45a2a31 100644
--- a/test_conformance/math_brute_force/unary_float.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -171,13 +171,6 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
 
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_float)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/unary_two_results_double.cpp b/test_conformance/math_brute_force/unary_two_results_double.cpp
index 9402e24c1d..6f3a080bad 100644
--- a/test_conformance/math_brute_force/unary_two_results_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_double.cpp
@@ -136,9 +136,9 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
     int ftz = f->ftz || gForceFTZ;
     double maxErrorVal0 = 0.0f;
     double maxErrorVal1 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1);
+    uint64_t step = getTestStep(sizeof(cl_double), BUFFER_SIZE);
+    int scale =
+        (int)((1ULL << 32) / (16 * BUFFER_SIZE / sizeof(cl_double)) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -160,16 +160,16 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
         double *p = (double *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
         }
         else
         {
-            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
@@ -179,20 +179,20 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
                 goto exit;
             }
 
-            memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
-                                          bufferSize, gOut2[j], 0, NULL, NULL)))
+            memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+            if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE,
+                                              0, BUFFER_SIZE, gOut2[j], 0, NULL,
+                                              NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
                            error, j);
@@ -204,7 +204,7 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -240,7 +240,7 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
         double *r = (double *)gOut_Ref;
         double *r2 = (double *)gOut_Ref2;
         double *s = (double *)gIn;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
         {
             long double dd;
             r[j] = (double)f->dfunc.f_fpf(s[j], &dd);
@@ -252,14 +252,14 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
@@ -271,7 +271,7 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
         // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
         uint64_t *t2 = (uint64_t *)gOut_Ref2;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
             for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
@@ -413,7 +413,7 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
             if (gVerboseBruteForce)
             {
                 vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
+                     BUFFER_SIZE);
             }
             else
             {
diff --git a/test_conformance/math_brute_force/unary_two_results_float.cpp b/test_conformance/math_brute_force/unary_two_results_float.cpp
index d3fbf20b9a..f3cf032d68 100644
--- a/test_conformance/math_brute_force/unary_two_results_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_float.cpp
@@ -136,9 +136,8 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal0 = 0.0f;
     float maxErrorVal1 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
+    uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
+    int scale = (int)((1ULL << 32) / (16 * BUFFER_SIZE / sizeof(float)) + 1);
     cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
     int isFract = 0 == strcmp("fract", f->nameInCode);
     int skipNanInf = isFract && !gInfNanSupport;
@@ -162,7 +161,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         uint32_t *p = (uint32_t *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             {
                 p[j] = (uint32_t)i + j * scale;
                 if (relaxedMode && strcmp(f->name, "sincos") == 0)
@@ -174,7 +173,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         }
         else
         {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             {
                 p[j] = (uint32_t)i + j;
                 if (relaxedMode && strcmp(f->name, "sincos") == 0)
@@ -186,7 +185,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
@@ -196,20 +195,20 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
                 goto exit;
             }
 
-            memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
-                                          bufferSize, gOut2[j], 0, NULL, NULL)))
+            memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+            if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE,
+                                              0, BUFFER_SIZE, gOut2[j], 0, NULL,
+                                              NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
                            error, j);
@@ -221,7 +220,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -273,7 +272,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
 
         if (skipNanInf)
         {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             {
                 double dd;
                 feclearexcept(FE_OVERFLOW);
@@ -290,7 +289,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         }
         else
         {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             {
                 double dd;
                 if (relaxedMode)
@@ -309,14 +308,14 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
@@ -332,7 +331,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
         uint32_t *t2 = (uint32_t *)gOut_Ref2;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
@@ -548,7 +547,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
             if (gVerboseBruteForce)
             {
                 vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
+                     BUFFER_SIZE);
             }
             else
             {
diff --git a/test_conformance/math_brute_force/unary_two_results_i_double.cpp b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
index 66bf414e6f..df1a5aa8d1 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
@@ -144,9 +144,9 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
     double maxErrorVal = 0.0f;
     double maxErrorVal2 = 0.0f;
     cl_ulong maxiError = f->double_ulps == INFINITY ? CL_ULONG_MAX : 0;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1);
+    uint64_t step = getTestStep(sizeof(cl_double), BUFFER_SIZE);
+    int scale =
+        (int)((1ULL << 32) / (16 * BUFFER_SIZE / sizeof(cl_double)) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -168,16 +168,16 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
         double *p = (double *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
         }
         else
         {
-            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
@@ -187,20 +187,20 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
                 goto exit;
             }
 
-            memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
-                                          bufferSize, gOut2[j], 0, NULL, NULL)))
+            memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+            if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE,
+                                              0, BUFFER_SIZE, gOut2[j], 0, NULL,
+                                              NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
                            error, j);
@@ -212,7 +212,7 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -248,7 +248,7 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
         double *r = (double *)gOut_Ref;
         int *r2 = (int *)gOut_Ref2;
         double *s = (double *)gIn;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
             r[j] = (double)f->dfunc.f_fpI(s[j], r2 + j);
 
         // Read the data back
@@ -256,14 +256,14 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
@@ -275,7 +275,7 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
         // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
             for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
@@ -385,7 +385,7 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
             if (gVerboseBruteForce)
             {
                 vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
+                     BUFFER_SIZE);
             }
             else
             {
diff --git a/test_conformance/math_brute_force/unary_two_results_i_float.cpp b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
index f6647b2239..c95ee061e4 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
@@ -141,9 +141,8 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
+    uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
+    int scale = (int)((1ULL << 32) / (16 * BUFFER_SIZE / sizeof(float)) + 1);
     cl_ulong maxiError;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
@@ -172,16 +171,16 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
         uint32_t *p = (uint32_t *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = (uint32_t)i + j * scale;
         }
         else
         {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = (uint32_t)i + j;
         }
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
@@ -191,20 +190,20 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
                 goto exit;
             }
 
-            memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
-                                          bufferSize, gOut2[j], 0, NULL, NULL)))
+            memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+            if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE,
+                                              0, BUFFER_SIZE, gOut2[j], 0, NULL,
+                                              NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
                            error, j);
@@ -216,7 +215,7 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -252,7 +251,7 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
         float *r = (float *)gOut_Ref;
         int *r2 = (int *)gOut_Ref2;
         float *s = (float *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             r[j] = (float)f->func.f_fpI(s[j], r2 + j);
 
         // Read the data back
@@ -260,14 +259,14 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
@@ -279,7 +278,7 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
@@ -384,7 +383,7 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
             if (gVerboseBruteForce)
             {
                 vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
+                     BUFFER_SIZE);
             }
             else
             {
diff --git a/test_conformance/math_brute_force/unary_u_double.cpp b/test_conformance/math_brute_force/unary_u_double.cpp
index c7034577f8..5f252614cc 100644
--- a/test_conformance/math_brute_force/unary_u_double.cpp
+++ b/test_conformance/math_brute_force/unary_u_double.cpp
@@ -134,8 +134,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ;
     double maxErrorVal = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
+    uint64_t step = getTestStep(sizeof(cl_double), BUFFER_SIZE);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -155,10 +154,10 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
     {
         // Init input array
         cl_ulong *p = (cl_ulong *)gIn;
-        for (j = 0; j < bufferSize / sizeof(cl_ulong); j++) p[j] = random64(d);
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_ulong); j++) p[j] = random64(d);
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
@@ -168,10 +167,10 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
@@ -183,7 +182,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -212,7 +211,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
         // Calculate the correctly rounded reference result
         double *r = (double *)gOut_Ref;
         cl_ulong *s = (cl_ulong *)gIn;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
             r[j] = (double)f->dfunc.f_u(s[j]);
 
         // Read the data back
@@ -220,7 +219,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 goto exit;
@@ -231,7 +230,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
 
         // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
         {
             for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
@@ -283,7 +282,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
             if (gVerboseBruteForce)
             {
                 vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
+                     BUFFER_SIZE);
             }
             else
             {
diff --git a/test_conformance/math_brute_force/unary_u_float.cpp b/test_conformance/math_brute_force/unary_u_float.cpp
index 86389e4a3e..18534dc57b 100644
--- a/test_conformance/math_brute_force/unary_u_float.cpp
+++ b/test_conformance/math_brute_force/unary_u_float.cpp
@@ -126,9 +126,8 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(double)) + 1);
+    uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
+    int scale = (int)((1ULL << 32) / (16 * BUFFER_SIZE / sizeof(double)) + 1);
     int isRangeLimited = 0;
     float half_sin_cos_tan_limit = 0;
 
@@ -172,16 +171,16 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
         uint32_t *p = (uint32_t *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = (uint32_t)i + j * scale;
         }
         else
         {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = (uint32_t)i + j;
         }
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
@@ -191,10 +190,10 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
@@ -206,7 +205,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -235,7 +234,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
         // Calculate the correctly rounded reference result
         float *r = (float *)gOut_Ref;
         cl_uint *s = (cl_uint *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             r[j] = (float)f->func.f_u(s[j]);
 
         // Read the data back
@@ -243,7 +242,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 goto exit;
@@ -255,7 +254,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
 
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
@@ -316,7 +315,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
             if (gVerboseBruteForce)
             {
                 vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
+                     BUFFER_SIZE);
             }
             else
             {
diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h
index 0e11457c17..ac4db9c8de 100644
--- a/test_conformance/math_brute_force/utility.h
+++ b/test_conformance/math_brute_force/utility.h
@@ -34,7 +34,6 @@
 
 struct Func;
 
-extern int gWimpyBufferSize;
 extern int gWimpyReductionFactor;
 
 #define VECTOR_SIZE_COUNT 6

From edc36b5d53c9b1cb6cd8f293fb461fc1151f1ace Mon Sep 17 00:00:00 2001
From: Chetan Mistry <70694498+chemis01@users.noreply.github.com>
Date: Tue, 27 Apr 2021 15:43:39 +0100
Subject: [PATCH 077/158] Remove NULL Platform Check for Negative
 clGetPlatformInfo (#1222) (#1229)

Part of the negative CTS test for clGetPlatformInfo incorrectly
called it with a NULL platform. Because a NULL platform is a
special case, that part of the test was not correct.
This change removes the incorrect part of the test so that
it conforms with the OpenCL specification.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>
---
 test_conformance/api/negative_platform.cpp | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/test_conformance/api/negative_platform.cpp b/test_conformance/api/negative_platform.cpp
index d41b35fee2..7d9de5df87 100644
--- a/test_conformance/api/negative_platform.cpp
+++ b/test_conformance/api/negative_platform.cpp
@@ -42,15 +42,7 @@ int test_negative_get_platform_info(cl_device_id deviceID, cl_context context,
 {
     cl_platform_id platform = getPlatformFromDevice(deviceID);
 
-    cl_int err = clGetPlatformInfo(nullptr, CL_PLATFORM_VERSION, sizeof(char*),
-                                   nullptr, nullptr);
-    test_failure_error_ret(
-        err, CL_INVALID_PLATFORM,
-        "clGetPlatformInfo should return CL_INVALID_PLATFORM  when: \"platform "
-        "is not a valid platform\" using a nullptr",
-        TEST_FAIL);
-
-    err =
+    cl_int err =
         clGetPlatformInfo(reinterpret_cast<cl_platform_id>(deviceID),
                           CL_PLATFORM_VERSION, sizeof(char*), nullptr, nullptr);
     test_failure_error_ret(

From cba7a8a537009fa9aca392bfb4078479ea068ccf Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Wed, 28 Apr 2021 09:30:25 +0100
Subject: [PATCH 078/158] Remove dead CMake code (#1230)

Remove CMake statements that have no effect in math_brute_force.

set_source_files_properties without a list of files has no effect.

When MODULE_NAME is FOO,
set_source_files_properties(${MODULE_NAME}_SOURCES ...) sets the
properties of a file literally named "FOO_SOURCES", not of the files
listed in the variable named FOO_SOURCES.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/CMakeLists.txt | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt
index 96433945eb..d8dfc40322 100644
--- a/test_conformance/math_brute_force/CMakeLists.txt
+++ b/test_conformance/math_brute_force/CMakeLists.txt
@@ -34,15 +34,4 @@ set(${MODULE_NAME}_SOURCES
     utility.cpp
 )
 
-if (NOT CMAKE_CL_64 AND NOT MSVC AND NOT ANDROID)
-set_source_files_properties(
-${MODULE_NAME}_SOURCES
-    COMPILE_FLAGS -march=i686)
-endif (NOT CMAKE_CL_64 AND NOT MSVC AND NOT ANDROID)
-
-if(CMAKE_COMPILER_IS_GNUCC)
-set_source_files_properties(
-        COMPILE_FLAGS -O0)
-endif(CMAKE_COMPILER_IS_GNUCC)
-
 include(../CMakeCommon.txt)

From 01497c402e03e536468cf7acc1504ecfe7ddce36 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Wed, 28 Apr 2021 09:30:51 +0100
Subject: [PATCH 079/158] Reduce scope of variables (#1228)

Make variables local to loops, with appropriate types. These variables
are not read after the loop without being reset first, so this patch
doesn't change behaviour.

These variables should now be used for one purpose only, making it
easier to reason about the code. This will make future refactoring
easier.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/binary_double.cpp        | 50 +++++++-------
 .../math_brute_force/binary_float.cpp         | 53 +++++++-------
 .../math_brute_force/binary_i_double.cpp      | 50 +++++++-------
 .../math_brute_force/binary_i_float.cpp       | 52 +++++++-------
 .../binary_operator_double.cpp                | 50 +++++++-------
 .../binary_operator_float.cpp                 | 69 +++++++++----------
 .../binary_two_results_i_double.cpp           | 23 +++----
 .../binary_two_results_i_float.cpp            | 23 +++----
 .../math_brute_force/i_unary_double.cpp       | 22 +++---
 .../math_brute_force/i_unary_float.cpp        | 22 +++---
 .../math_brute_force/macro_binary_double.cpp  | 48 +++++++------
 .../math_brute_force/macro_binary_float.cpp   | 48 +++++++------
 .../math_brute_force/macro_unary_double.cpp   | 32 ++++-----
 .../math_brute_force/macro_unary_float.cpp    | 35 +++++-----
 .../math_brute_force/mad_double.cpp           | 16 ++---
 .../math_brute_force/mad_float.cpp            | 16 ++---
 .../math_brute_force/ternary_double.cpp       | 39 +++++------
 .../math_brute_force/ternary_float.cpp        | 41 ++++++-----
 .../math_brute_force/unary_double.cpp         | 35 +++++-----
 .../math_brute_force/unary_float.cpp          | 34 +++++----
 .../unary_two_results_double.cpp              | 22 +++---
 .../unary_two_results_float.cpp               | 24 +++----
 .../unary_two_results_i_double.cpp            | 22 +++---
 .../unary_two_results_i_float.cpp             | 22 +++---
 .../math_brute_force/unary_u_double.cpp       | 21 +++---
 .../math_brute_force/unary_u_float.cpp        | 22 +++---
 26 files changed, 420 insertions(+), 471 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
index 9ecad99a37..4baa499180 100644
--- a/test_conformance/math_brute_force/binary_double.cpp
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -286,7 +286,6 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfo test_info;
     cl_int error;
-    size_t i, j;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
     double maxErrorVal2 = 0.0;
@@ -321,7 +320,7 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
         test_info.k[i] = (cl_kernel *)malloc(array_size);
@@ -344,7 +343,7 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     }
     memset(test_info.tinfo, 0,
            test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
             i * test_info.subBufferSize * sizeof(cl_double),
@@ -371,7 +370,7 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
         }
 
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
@@ -413,7 +412,7 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             if (test_info.tinfo[i].maxError > maxError)
             {
@@ -437,12 +436,12 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
         if (test_info.k[i])
         {
-            for (j = 0; j < test_info.threadCount; j++)
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
             free(test_info.k[i]);
@@ -450,12 +449,12 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     }
     if (test_info.tinfo)
     {
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
@@ -477,7 +476,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     dptr func = job->f->dfunc;
     int ftz = job->ftz;
     MTdata d = tinfo->d;
-    cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
 
@@ -492,7 +490,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_ulong *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         out[j] = (cl_ulong *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
@@ -511,11 +509,11 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // Init input array
     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
     cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
-    j = 0;
+    cl_uint idx = 0;
     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+    int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if (job_id <= (cl_uint)indx)
+    if (job_id <= (cl_uint)lastSpecialJobIndex)
     { // test edge cases
         cl_double *fp = (cl_double *)p;
         cl_double *fp2 = (cl_double *)p2;
@@ -524,10 +522,10 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         x = (job_id * buffer_elements) % specialValuesCount;
         y = (job_id * buffer_elements) / specialValuesCount;
 
-        for (; j < buffer_elements; j++)
+        for (; idx < buffer_elements; idx++)
         {
-            fp[j] = specialValues[x];
-            fp2[j] = specialValues[y];
+            fp[idx] = specialValues[x];
+            fp2[idx] = specialValues[y];
             if (++x >= specialValuesCount)
             {
                 x = 0;
@@ -538,10 +536,10 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
     // Init any remaining values.
-    for (; j < buffer_elements; j++)
+    for (; idx < buffer_elements; idx++)
     {
-        p[j] = genrand_int64(d);
-        p2[j] = genrand_int64(d);
+        p[idx] = genrand_int64(d);
+        p2[idx] = genrand_int64(d);
     }
 
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
@@ -558,7 +556,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         goto exit;
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         // Wait for the map to finish
         if ((error = clWaitForEvents(1, e + j)))
@@ -626,12 +624,12 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
     s = (cl_double *)gIn + thread_id * buffer_elements;
     s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
         r[j] = (cl_double)func.f_ff(s[j], s2[j]);
 
     // Read the data back -- no need to wait for the first N-1 buffers but wait
     // for the last buffer. This is an in order queue.
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_ulong *)clEnqueueMapBuffer(
@@ -647,9 +645,9 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Verify data
     t = (cl_ulong *)r;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
     {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_ulong *q = out[k];
 
@@ -794,7 +792,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
diff --git a/test_conformance/math_brute_force/binary_float.cpp b/test_conformance/math_brute_force/binary_float.cpp
index 4186f1c4f0..80ddba6fd3 100644
--- a/test_conformance/math_brute_force/binary_float.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -276,7 +276,6 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfo test_info;
     cl_int error;
-    size_t i, j;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
     double maxErrorVal2 = 0.0;
@@ -313,7 +312,7 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
         test_info.k[i] = (cl_kernel *)malloc(array_size);
@@ -336,7 +335,7 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     }
     memset(test_info.tinfo, 0,
            test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
             i * test_info.subBufferSize * sizeof(cl_float),
@@ -363,7 +362,7 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
         }
 
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
@@ -405,7 +404,7 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             if (test_info.tinfo[i].maxError > maxError)
             {
@@ -429,12 +428,12 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
         if (test_info.k[i])
         {
-            for (j = 0; j < test_info.threadCount; j++)
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
             free(test_info.k[i]);
@@ -442,12 +441,12 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     }
     if (test_info.tinfo)
     {
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
@@ -470,7 +469,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     bool relaxedMode = job->relaxedMode;
     float ulps = getAllowedUlpError(job->f, relaxedMode);
     MTdata d = tinfo->d;
-    cl_uint j, k;
     cl_int error;
     cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
     const char *name = job->f->name;
@@ -498,7 +496,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_uint *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
@@ -517,12 +515,11 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // Init input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
-    j = 0;
-
+    cl_uint idx = 0;
     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+    int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if (job_id <= (cl_uint)indx)
+    if (job_id <= (cl_uint)lastSpecialJobIndex)
     { // test edge cases
         float *fp = (float *)p;
         float *fp2 = (float *)p2;
@@ -531,10 +528,10 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         x = (job_id * buffer_elements) % specialValuesCount;
         y = (job_id * buffer_elements) / specialValuesCount;
 
-        for (; j < buffer_elements; j++)
+        for (; idx < buffer_elements; idx++)
         {
-            fp[j] = specialValues[x];
-            fp2[j] = specialValues[y];
+            fp[idx] = specialValues[x];
+            fp2[idx] = specialValues[y];
             ++x;
             if (x >= specialValuesCount)
             {
@@ -546,10 +543,10 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
     // Init any remaining values.
-    for (; j < buffer_elements; j++)
+    for (; idx < buffer_elements; idx++)
     {
-        p[j] = genrand_int32(d);
-        p2[j] = genrand_int32(d);
+        p[idx] = genrand_int32(d);
+        p2[idx] = genrand_int32(d);
     }
 
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
@@ -566,7 +563,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         goto exit;
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         // Wait for the map to finish
         if ((error = clWaitForEvents(1, e + j)))
@@ -661,7 +658,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     s2 = (float *)gIn2 + thread_id * buffer_elements;
     if (skipNanInf)
     {
-        for (j = 0; j < buffer_elements; j++)
+        for (size_t j = 0; j < buffer_elements; j++)
         {
             feclearexcept(FE_OVERFLOW);
             r[j] = (float)ref_func(s[j], s2[j]);
@@ -671,7 +668,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     }
     else
     {
-        for (j = 0; j < buffer_elements; j++)
+        for (size_t j = 0; j < buffer_elements; j++)
             r[j] = (float)ref_func(s[j], s2[j]);
     }
 
@@ -679,7 +676,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Read the data back -- no need to wait for the first N-1 buffers but wait
     // for the last buffer. This is an in order queue.
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_uint *)clEnqueueMapBuffer(
@@ -697,9 +694,9 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     {
         // Verify data
         t = (cl_uint *)r;
-        for (j = 0; j < buffer_elements; j++)
+        for (size_t j = 0; j < buffer_elements; j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 cl_uint *q = out[k];
 
@@ -956,7 +953,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     if (isFDim && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index 6a4ecfc43d..69e620aaa5 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -288,7 +288,6 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfo test_info;
     cl_int error;
-    size_t i, j;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
     cl_int maxErrorVal2 = 0;
@@ -319,7 +318,7 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
 
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
         test_info.k[i] = (cl_kernel *)malloc(array_size);
@@ -342,7 +341,7 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
     }
     memset(test_info.tinfo, 0,
            test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
             i * test_info.subBufferSize * sizeof(cl_double),
@@ -372,7 +371,7 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
         }
 
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
@@ -414,7 +413,7 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             if (test_info.tinfo[i].maxError > maxError)
             {
@@ -438,12 +437,12 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
         if (test_info.k[i])
         {
-            for (j = 0; j < test_info.threadCount; j++)
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
             free(test_info.k[i]);
@@ -451,12 +450,12 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
     }
     if (test_info.tinfo)
     {
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
@@ -478,7 +477,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     dptr func = job->f->dfunc;
     int ftz = job->ftz;
     MTdata d = tinfo->d;
-    cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
     cl_ulong *t;
@@ -491,7 +489,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_ulong *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         out[j] = (cl_ulong *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
@@ -510,11 +508,11 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // Init input array
     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
     cl_int *p2 = (cl_int *)gIn2 + thread_id * buffer_elements;
-    j = 0;
+    size_t idx = 0;
     int totalSpecialValueCount = specialValuesCount * specialValuesIntCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+    int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if (job_id <= (cl_uint)indx)
+    if (job_id <= (cl_uint)lastSpecialJobIndex)
     { // test edge cases
         cl_double *fp = (cl_double *)p;
         cl_int *ip2 = (cl_int *)p2;
@@ -523,10 +521,10 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         x = (job_id * buffer_elements) % specialValuesCount;
         y = (job_id * buffer_elements) / specialValuesCount;
 
-        for (; j < buffer_elements; j++)
+        for (; idx < buffer_elements; idx++)
         {
-            fp[j] = specialValues[x];
-            ip2[j] = specialValuesInt[y];
+            fp[idx] = specialValues[x];
+            ip2[idx] = specialValuesInt[y];
             if (++x >= specialValuesCount)
             {
                 x = 0;
@@ -537,10 +535,10 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
     // Init any remaining values.
-    for (; j < buffer_elements; j++)
+    for (; idx < buffer_elements; idx++)
     {
-        p[j] = DoubleFromUInt32(genrand_int32(d));
-        p2[j] = genrand_int32(d);
+        p[idx] = DoubleFromUInt32(genrand_int32(d));
+        p2[idx] = genrand_int32(d);
     }
 
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
@@ -557,7 +555,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         goto exit;
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         // Wait for the map to finish
         if ((error = clWaitForEvents(1, e + j)))
@@ -625,12 +623,12 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
     s = (cl_double *)gIn + thread_id * buffer_elements;
     s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
         r[j] = (cl_double)func.f_fi(s[j], s2[j]);
 
     // Read the data back -- no need to wait for the first N-1 buffers but wait
     // for the last buffer. This is an in order queue.
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_ulong *)clEnqueueMapBuffer(
@@ -646,9 +644,9 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Verify data
     t = (cl_ulong *)r;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
     {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_ulong *q = out[k];
 
@@ -713,7 +711,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
index 1e7103c28f..e65a9aaffc 100644
--- a/test_conformance/math_brute_force/binary_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -280,7 +280,6 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfo test_info;
     cl_int error;
-    size_t i, j;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
     cl_int maxErrorVal2 = 0;
@@ -312,7 +311,7 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
 
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
         test_info.k[i] = (cl_kernel *)malloc(array_size);
@@ -335,7 +334,7 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
     }
     memset(test_info.tinfo, 0,
            test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
             i * test_info.subBufferSize * sizeof(cl_float),
@@ -365,7 +364,7 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
         }
 
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
@@ -407,7 +406,7 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             if (test_info.tinfo[i].maxError > maxError)
             {
@@ -431,12 +430,12 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
         if (test_info.k[i])
         {
-            for (j = 0; j < test_info.threadCount; j++)
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
             free(test_info.k[i]);
@@ -444,12 +443,12 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
     }
     if (test_info.tinfo)
     {
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
@@ -471,7 +470,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     int ftz = job->ftz;
     float ulps = job->ulps;
     MTdata d = tinfo->d;
-    cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
     cl_uint *t = 0;
@@ -482,7 +480,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_uint *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
@@ -501,12 +499,11 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // Init input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
-    j = 0;
-
+    size_t idx = 0;
     int totalSpecialValueCount = specialValuesCount * specialValuesIntCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+    int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if (job_id <= (cl_uint)indx)
+    if (job_id <= (cl_uint)lastSpecialJobIndex)
     { // test edge cases
         float *fp = (float *)p;
         cl_int *ip2 = (cl_int *)p2;
@@ -515,10 +512,10 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         x = (job_id * buffer_elements) % specialValuesCount;
         y = (job_id * buffer_elements) / specialValuesCount;
 
-        for (; j < buffer_elements; j++)
+        for (; idx < buffer_elements; idx++)
         {
-            fp[j] = specialValues[x];
-            ip2[j] = specialValuesInt[y];
+            fp[idx] = specialValues[x];
+            ip2[idx] = specialValuesInt[y];
             ++x;
             if (x >= specialValuesCount)
             {
@@ -530,10 +527,10 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
     // Init any remaining values.
-    for (; j < buffer_elements; j++)
+    for (; idx < buffer_elements; idx++)
     {
-        p[j] = genrand_int32(d);
-        p2[j] = genrand_int32(d);
+        p[idx] = genrand_int32(d);
+        p2[idx] = genrand_int32(d);
     }
 
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
@@ -550,7 +547,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         goto exit;
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         // Wait for the map to finish
         if ((error = clWaitForEvents(1, e + j)))
@@ -618,11 +615,12 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     r = (float *)gOut_Ref + thread_id * buffer_elements;
     s = (float *)gIn + thread_id * buffer_elements;
     s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_fi(s[j], s2[j]);
+    for (size_t j = 0; j < buffer_elements; j++)
+        r[j] = (float)func.f_fi(s[j], s2[j]);
 
     // Read the data back -- no need to wait for the first N-1 buffers but wait
     // for the last buffer. This is an in order queue.
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_uint *)clEnqueueMapBuffer(
@@ -638,9 +636,9 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Verify data
     t = (cl_uint *)r;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
     {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_uint *q = out[k];
 
@@ -707,7 +705,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index 740c9d7f90..5481f12773 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -284,7 +284,6 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
 {
     TestInfo test_info;
     cl_int error;
-    size_t i, j;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
     double maxErrorVal2 = 0.0;
@@ -315,7 +314,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
 
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
         test_info.k[i] = (cl_kernel *)malloc(array_size);
@@ -338,7 +337,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
     }
     memset(test_info.tinfo, 0,
            test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
             i * test_info.subBufferSize * sizeof(cl_double),
@@ -365,7 +364,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
             goto exit;
         }
 
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
@@ -407,7 +406,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             if (test_info.tinfo[i].maxError > maxError)
             {
@@ -431,12 +430,12 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
 
 exit:
     // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
         if (test_info.k[i])
         {
-            for (j = 0; j < test_info.threadCount; j++)
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
             free(test_info.k[i]);
@@ -444,12 +443,12 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
     }
     if (test_info.tinfo)
     {
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
@@ -472,7 +471,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     int ftz = job->ftz;
     bool relaxedMode = job->relaxedMode;
     MTdata d = tinfo->d;
-    cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
     cl_ulong *t;
@@ -485,7 +483,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_ulong *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         out[j] = (cl_ulong *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
@@ -504,11 +502,11 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // Init input array
     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
     cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
-    j = 0;
+    cl_uint idx = 0;
     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+    int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if (job_id <= (cl_uint)indx)
+    if (job_id <= (cl_uint)lastSpecialJobIndex)
     { // test edge cases
         cl_double *fp = (cl_double *)p;
         cl_double *fp2 = (cl_double *)p2;
@@ -517,10 +515,10 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         x = (job_id * buffer_elements) % specialValuesCount;
         y = (job_id * buffer_elements) / specialValuesCount;
 
-        for (; j < buffer_elements; j++)
+        for (; idx < buffer_elements; idx++)
         {
-            fp[j] = specialValues[x];
-            fp2[j] = specialValues[y];
+            fp[idx] = specialValues[x];
+            fp2[idx] = specialValues[y];
             if (++x >= specialValuesCount)
             {
                 x = 0;
@@ -531,10 +529,10 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
     // Init any remaining values.
-    for (; j < buffer_elements; j++)
+    for (; idx < buffer_elements; idx++)
     {
-        p[j] = genrand_int64(d);
-        p2[j] = genrand_int64(d);
+        p[idx] = genrand_int64(d);
+        p2[idx] = genrand_int64(d);
     }
 
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
@@ -551,7 +549,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         goto exit;
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         // Wait for the map to finish
         if ((error = clWaitForEvents(1, e + j)))
@@ -619,12 +617,12 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
     s = (cl_double *)gIn + thread_id * buffer_elements;
     s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
         r[j] = (cl_double)func.f_ff(s[j], s2[j]);
 
     // Read the data back -- no need to wait for the first N-1 buffers but wait
     // for the last buffer. This is an in order queue.
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_ulong *)clEnqueueMapBuffer(
@@ -640,9 +638,9 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Verify data
     t = (cl_ulong *)r;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
     {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_ulong *q = out[k];
 
@@ -763,7 +761,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
index 921f7101c5..ccaef604be 100644
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -274,7 +274,6 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
 {
     TestInfo test_info;
     cl_int error;
-    size_t i, j;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
     double maxErrorVal2 = 0.0;
@@ -307,7 +306,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
 
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
         test_info.k[i] = (cl_kernel *)malloc(array_size);
@@ -330,7 +329,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
     }
     memset(test_info.tinfo, 0,
            test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
             i * test_info.subBufferSize * sizeof(cl_float),
@@ -357,7 +356,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
             goto exit;
         }
 
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
@@ -399,7 +398,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             if (test_info.tinfo[i].maxError > maxError)
             {
@@ -423,12 +422,12 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
 
 exit:
     // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
         if (test_info.k[i])
         {
-            for (j = 0; j < test_info.threadCount; j++)
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
             free(test_info.k[i]);
@@ -436,12 +435,12 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
     }
     if (test_info.tinfo)
     {
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
@@ -464,7 +463,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     bool relaxedMode = job->relaxedMode;
     float ulps = getAllowedUlpError(job->f, relaxedMode);
     MTdata d = tinfo->d;
-    cl_uint j, k;
     cl_int error;
     cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
     const char *name = job->f->name;
@@ -482,7 +480,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_uint *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
@@ -501,12 +499,11 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // Init input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
-    j = 0;
-
+    cl_uint idx = 0;
     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+    int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if (job_id <= (cl_uint)indx)
+    if (job_id <= (cl_uint)lastSpecialJobIndex)
     {
         // Insert special values
         uint32_t x, y;
@@ -514,10 +511,10 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         x = (job_id * buffer_elements) % specialValuesCount;
         y = (job_id * buffer_elements) / specialValuesCount;
 
-        for (; j < buffer_elements; j++)
+        for (; idx < buffer_elements; idx++)
         {
-            p[j] = ((cl_uint *)specialValues)[x];
-            p2[j] = ((cl_uint *)specialValues)[y];
+            p[idx] = ((cl_uint *)specialValues)[x];
+            p2[idx] = ((cl_uint *)specialValues)[y];
             ++x;
             if (x >= specialValuesCount)
             {
@@ -527,28 +524,28 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
             }
             if (relaxedMode && strcmp(name, "divide") == 0)
             {
-                cl_uint pj = p[j] & 0x7fffffff;
-                cl_uint p2j = p2[j] & 0x7fffffff;
+                cl_uint pj = p[idx] & 0x7fffffff;
+                cl_uint p2j = p2[idx] & 0x7fffffff;
                 // Replace values outside [2^-62, 2^62] with QNaN
-                if (pj < 0x20800000 || pj > 0x5e800000) p[j] = 0x7fc00000;
-                if (p2j < 0x20800000 || p2j > 0x5e800000) p2[j] = 0x7fc00000;
+                if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
+                if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
             }
         }
     }
 
     // Init any remaining values.
-    for (; j < buffer_elements; j++)
+    for (; idx < buffer_elements; idx++)
     {
-        p[j] = genrand_int32(d);
-        p2[j] = genrand_int32(d);
+        p[idx] = genrand_int32(d);
+        p2[idx] = genrand_int32(d);
 
         if (relaxedMode && strcmp(name, "divide") == 0)
         {
-            cl_uint pj = p[j] & 0x7fffffff;
-            cl_uint p2j = p2[j] & 0x7fffffff;
+            cl_uint pj = p[idx] & 0x7fffffff;
+            cl_uint p2j = p2[idx] & 0x7fffffff;
             // Replace values outside [2^-62, 2^62] with QNaN
-            if (pj < 0x20800000 || pj > 0x5e800000) p[j] = 0x7fc00000;
-            if (p2j < 0x20800000 || p2j > 0x5e800000) p2[j] = 0x7fc00000;
+            if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
+            if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
         }
     }
 
@@ -566,7 +563,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         goto exit;
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         // Wait for the map to finish
         if ((error = clWaitForEvents(1, e + j)))
@@ -649,12 +646,12 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     s2 = (float *)gIn2 + thread_id * buffer_elements;
     if (gInfNanSupport)
     {
-        for (j = 0; j < buffer_elements; j++)
+        for (size_t j = 0; j < buffer_elements; j++)
             r[j] = (float)func.f_ff(s[j], s2[j]);
     }
     else
     {
-        for (j = 0; j < buffer_elements; j++)
+        for (size_t j = 0; j < buffer_elements; j++)
         {
             feclearexcept(FE_OVERFLOW);
             r[j] = (float)func.f_ff(s[j], s2[j]);
@@ -669,7 +666,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Read the data back -- no need to wait for the first N-1 buffers but wait
     // for the last buffer. This is an in order queue.
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_uint *)clEnqueueMapBuffer(
@@ -685,9 +682,9 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Verify data
     t = (cl_uint *)r;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
     {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_uint *q = out[k];
 
@@ -892,7 +889,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
diff --git a/test_conformance/math_brute_force/binary_two_results_i_double.cpp b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
index e6243754d8..50250f9db4 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
@@ -154,13 +154,12 @@ static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
     double *r = cri->r + off;
     int *i = cri->i + off;
     long double (*f)(long double, long double, int *) = cri->f_ffpI;
-    cl_uint j;
 
     if (off + count > lim) count = lim - off;
 
     Force64BitFPUPrecision();
 
-    for (j = 0; j < count; ++j)
+    for (cl_uint j = 0; j < count; ++j)
         r[j] = (double)f((long double)x[j], (long double)y[j], i + j);
 
     return CL_SUCCESS;
@@ -168,8 +167,6 @@ static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
 
 int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
@@ -198,12 +195,12 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             return error;
     }
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
@@ -224,7 +221,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -249,7 +246,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeof(cl_double) * sizeValues[j];
             size_t localCount = (BUFFER_SIZE + vectorSize - 1)
@@ -311,12 +308,12 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         {
             double *r = (double *)gOut_Ref;
             int *r2 = (int *)gOut_Ref2;
-            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(double); j++)
                 r[j] = (double)f->dfunc.f_ffpI(s[j], s2[j], r2 + j);
         }
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
@@ -339,9 +336,9 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint64_t *q = (uint64_t *)gOut[k];
                 int32_t *q2 = (int32_t *)gOut2[k];
@@ -572,7 +569,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
diff --git a/test_conformance/math_brute_force/binary_two_results_i_float.cpp b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
index 476ae570d6..dfdd4a2ec4 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
@@ -152,11 +152,10 @@ static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
     float *r = cri->r + off;
     int *i = cri->i + off;
     double (*f)(double, double, int *) = cri->f_ffpI;
-    cl_uint j;
 
     if (off + count > lim) count = lim - off;
 
-    for (j = 0; j < count; ++j)
+    for (cl_uint j = 0; j < count; ++j)
         r[j] = (float)f((double)x[j], (double)y[j], i + j);
 
     return CL_SUCCESS;
@@ -164,8 +163,6 @@ static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
 
 int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
@@ -199,12 +196,12 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             return error;
     }
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         cl_uint *p = (cl_uint *)gIn;
         cl_uint *p2 = (cl_uint *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
@@ -225,7 +222,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -250,7 +247,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeof(cl_float) * sizeValues[j];
             size_t localCount = (BUFFER_SIZE + vectorSize - 1)
@@ -312,12 +309,12 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         {
             float *r = (float *)gOut_Ref;
             int *r2 = (int *)gOut_Ref2;
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 r[j] = (float)f->func.f_ffpI(s[j], s2[j], r2 + j);
         }
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
@@ -340,9 +337,9 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
                 int32_t *q2 = (int32_t *)gOut2[k];
@@ -557,7 +554,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
diff --git a/test_conformance/math_brute_force/i_unary_double.cpp b/test_conformance/math_brute_force/i_unary_double.cpp
index 451e43e8a5..4383fa8b2f 100644
--- a/test_conformance/math_brute_force/i_unary_double.cpp
+++ b/test_conformance/math_brute_force/i_unary_double.cpp
@@ -119,8 +119,6 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 
 int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
@@ -148,18 +146,18 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             return error;
     }
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         double *p = (double *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
         }
         else
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
 
@@ -171,7 +169,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -186,7 +184,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1)
@@ -219,11 +217,11 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
         // Calculate the correctly rounded reference result
         int *r = (int *)gOut_Ref;
         double *s = (double *)gIn;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
             r[j] = f->dfunc.i_f(s[j]);
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
@@ -238,9 +236,9 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
                 // If we aren't getting the correctly rounded result
@@ -294,7 +292,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 exit:
     RestoreFPState(&oldMode);
     // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
diff --git a/test_conformance/math_brute_force/i_unary_float.cpp b/test_conformance/math_brute_force/i_unary_float.cpp
index 8883d7a1b2..c803aa3252 100644
--- a/test_conformance/math_brute_force/i_unary_float.cpp
+++ b/test_conformance/math_brute_force/i_unary_float.cpp
@@ -117,8 +117,6 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 
 int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
@@ -145,18 +143,18 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
             return error;
     }
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         cl_uint *p = (cl_uint *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = (cl_uint)i + j * scale;
         }
         else
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = (uint32_t)i + j;
         }
 
@@ -168,7 +166,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -183,7 +181,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1)
@@ -216,11 +214,11 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
         // Calculate the correctly rounded reference result
         int *r = (int *)gOut_Ref;
         float *s = (float *)gIn;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             r[j] = f->func.i_f(s[j]);
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
@@ -235,9 +233,9 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
 
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
                 // If we aren't getting the correctly rounded result
@@ -290,7 +288,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
 exit:
     RestoreFPState(&oldMode);
     // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index 99c71b68dc..d09915f6ee 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -273,7 +273,6 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfo test_info;
     cl_int error;
-    size_t i, j;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -300,7 +299,7 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
         test_info.k[i] = (cl_kernel *)malloc(array_size);
@@ -323,7 +322,7 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     }
     memset(test_info.tinfo, 0,
            test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
+    for (size_t i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
             i * test_info.subBufferSize * sizeof(cl_double),
@@ -350,7 +349,7 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
         }
 
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
@@ -403,12 +402,12 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
         if (test_info.k[i])
         {
-            for (j = 0; j < test_info.threadCount; j++)
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
             free(test_info.k[i]);
@@ -416,12 +415,12 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     }
     if (test_info.tinfo)
     {
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
@@ -442,7 +441,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     dptr dfunc = job->f->dfunc;
     int ftz = job->ftz;
     MTdata d = tinfo->d;
-    cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
     cl_long *t;
@@ -455,7 +453,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_long *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         out[j] = (cl_long *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
@@ -474,21 +472,21 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // Init input array
     double *p = (double *)gIn + thread_id * buffer_elements;
     double *p2 = (double *)gIn2 + thread_id * buffer_elements;
-    j = 0;
+    cl_uint idx = 0;
     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+    int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if (job_id <= (cl_uint)indx)
+    if (job_id <= (cl_uint)lastSpecialJobIndex)
     { // test edge cases
         uint32_t x, y;
 
         x = (job_id * buffer_elements) % specialValuesCount;
         y = (job_id * buffer_elements) / specialValuesCount;
 
-        for (; j < buffer_elements; j++)
+        for (; idx < buffer_elements; idx++)
         {
-            p[j] = specialValues[x];
-            p2[j] = specialValues[y];
+            p[idx] = specialValues[x];
+            p2[idx] = specialValues[y];
             if (++x >= specialValuesCount)
             {
                 x = 0;
@@ -499,10 +497,10 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
     // Init any remaining values.
-    for (; j < buffer_elements; j++)
+    for (; idx < buffer_elements; idx++)
     {
-        ((cl_ulong *)p)[j] = genrand_int64(d);
-        ((cl_ulong *)p2)[j] = genrand_int64(d);
+        ((cl_ulong *)p)[idx] = genrand_int64(d);
+        ((cl_ulong *)p2)[idx] = genrand_int64(d);
     }
 
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
@@ -519,7 +517,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         goto exit;
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         // Wait for the map to finish
         if ((error = clWaitForEvents(1, e + j)))
@@ -587,11 +585,11 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
     s = (cl_double *)gIn + thread_id * buffer_elements;
     s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_ff(s[j], s2[j]);
+    for (size_t j = 0; j < buffer_elements; j++) r[j] = dfunc.i_ff(s[j], s2[j]);
 
     // Read the data back -- no need to wait for the first N-1 buffers but wait
     // for the last buffer. This is an in order queue.
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_long *)clEnqueueMapBuffer(
@@ -607,7 +605,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Verify data
     t = (cl_long *)r;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
     {
         cl_long *q = out[0];
 
@@ -656,7 +654,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
 
 
-        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
+        for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
         {
             q = (cl_long *)out[k];
             // If we aren't getting the correctly rounded result
@@ -704,7 +702,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
index 44050b7c19..c530cdafde 100644
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -263,7 +263,6 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfo test_info;
     cl_int error;
-    size_t i, j;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
@@ -291,7 +290,7 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
         test_info.k[i] = (cl_kernel *)malloc(array_size);
@@ -314,7 +313,7 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     }
     memset(test_info.tinfo, 0,
            test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
             i * test_info.subBufferSize * sizeof(cl_float),
@@ -341,7 +340,7 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
         }
 
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
@@ -394,12 +393,12 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
         if (test_info.k[i])
         {
-            for (j = 0; j < test_info.threadCount; j++)
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
             free(test_info.k[i]);
@@ -407,12 +406,12 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     }
     if (test_info.tinfo)
     {
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
@@ -433,7 +432,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     fptr func = job->f->func;
     int ftz = job->ftz;
     MTdata d = tinfo->d;
-    cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
     cl_int *t = 0;
@@ -444,7 +442,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_int *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         out[j] = (cl_int *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
@@ -463,12 +461,12 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // Init input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
-    j = 0;
+    cl_uint idx = 0;
 
     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+    int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if (job_id <= (cl_uint)indx)
+    if (job_id <= (cl_uint)lastSpecialJobIndex)
     { // test edge cases
         float *fp = (float *)p;
         float *fp2 = (float *)p2;
@@ -477,10 +475,10 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         x = (job_id * buffer_elements) % specialValuesCount;
         y = (job_id * buffer_elements) / specialValuesCount;
 
-        for (; j < buffer_elements; j++)
+        for (; idx < buffer_elements; idx++)
         {
-            fp[j] = specialValues[x];
-            fp2[j] = specialValues[y];
+            fp[idx] = specialValues[x];
+            fp2[idx] = specialValues[y];
             ++x;
             if (x >= specialValuesCount)
             {
@@ -492,10 +490,10 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
     // Init any remaining values.
-    for (; j < buffer_elements; j++)
+    for (; idx < buffer_elements; idx++)
     {
-        p[j] = genrand_int32(d);
-        p2[j] = genrand_int32(d);
+        p[idx] = genrand_int32(d);
+        p2[idx] = genrand_int32(d);
     }
 
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
@@ -512,7 +510,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         goto exit;
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         // Wait for the map to finish
         if ((error = clWaitForEvents(1, e + j)))
@@ -580,11 +578,11 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
     s = (float *)gIn + thread_id * buffer_elements;
     s2 = (float *)gIn2 + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++) r[j] = func.i_ff(s[j], s2[j]);
+    for (size_t j = 0; j < buffer_elements; j++) r[j] = func.i_ff(s[j], s2[j]);
 
     // Read the data back -- no need to wait for the first N-1 buffers but wait
     // for the last buffer. This is an in order queue.
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_int *)clEnqueueMapBuffer(
@@ -600,7 +598,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Verify data
     t = (cl_int *)r;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
     {
         cl_int *q = out[0];
 
@@ -646,7 +644,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
             goto exit;
         }
 
-        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
+        for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
         {
             q = out[k];
             // If we aren't getting the correctly rounded result
@@ -693,7 +691,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp
index 49852a85fd..00e65a2cff 100644
--- a/test_conformance/math_brute_force/macro_unary_double.cpp
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -151,7 +151,6 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfo test_info;
     cl_int error;
-    size_t i, j;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -178,7 +177,7 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
         test_info.k[i] = (cl_kernel *)malloc(array_size);
@@ -201,7 +200,7 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     }
     memset(test_info.tinfo, 0,
            test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
             i * test_info.subBufferSize * sizeof(cl_double),
@@ -218,7 +217,7 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
         }
 
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
@@ -269,12 +268,12 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
         if (test_info.k[i])
         {
-            for (j = 0; j < test_info.threadCount; j++)
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
             free(test_info.k[i]);
@@ -282,10 +281,10 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     }
     if (test_info.tinfo)
     {
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
@@ -306,7 +305,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     ThreadInfo *tinfo = job->tinfo + thread_id;
     dptr dfunc = job->f->dfunc;
     int ftz = job->ftz;
-    cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
 
@@ -315,7 +313,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_long *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         out[j] = (cl_long *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
@@ -333,7 +331,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Write the new values to the input array
     cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
         p[j] = DoubleFromUInt32(base + j * scale);
 
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
@@ -343,7 +341,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         return error;
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         // Wait for the map to finish
         if ((error = clWaitForEvents(1, e + j)))
@@ -404,11 +402,11 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // Calculate the correctly rounded reference result
     cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
     cl_double *s = (cl_double *)p;
-    for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]);
+    for (size_t j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]);
 
     // Read the data back -- no need to wait for the first N-1 buffers but wait
     // for the last buffer. This is an in order queue.
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_long *)clEnqueueMapBuffer(
@@ -424,7 +422,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Verify data
     cl_long *t = (cl_long *)r;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
     {
         cl_long *q = out[0];
 
@@ -450,7 +448,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
 
 
-        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
+        for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
         {
             q = out[k];
             // If we aren't getting the correctly rounded result
@@ -476,7 +474,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
diff --git a/test_conformance/math_brute_force/macro_unary_float.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp
index 116f8d74c3..3c1717acb4 100644
--- a/test_conformance/math_brute_force/macro_unary_float.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -150,7 +150,6 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfo test_info;
     cl_int error;
-    size_t i, j;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
@@ -178,7 +177,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
 
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
         test_info.k[i] = (cl_kernel *)malloc(array_size);
@@ -201,7 +200,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     }
     memset(test_info.tinfo, 0,
            test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
             i * test_info.subBufferSize * sizeof(cl_float),
@@ -218,7 +217,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
         }
 
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
@@ -269,12 +268,12 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
         if (test_info.k[i])
         {
-            for (j = 0; j < test_info.threadCount; j++)
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
             free(test_info.k[i]);
@@ -282,10 +281,10 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     }
     if (test_info.tinfo)
     {
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
@@ -306,7 +305,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     ThreadInfo *tinfo = job->tinfo + thread_id;
     fptr func = job->f->func;
     int ftz = job->ftz;
-    cl_uint j, k;
     cl_int error = CL_SUCCESS;
     cl_int ret = CL_SUCCESS;
     const char *name = job->f->name;
@@ -319,7 +317,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_int *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         out[j] = (cl_int *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
@@ -337,7 +335,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Init input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++) p[j] = base + j * scale;
+    for (size_t j = 0; j < buffer_elements; j++) p[j] = base + j * scale;
 
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
                                       buffer_size, p, 0, NULL, NULL)))
@@ -346,7 +344,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         return error;
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         // Wait for the map to finish
         if ((error = clWaitForEvents(1, e + j)))
@@ -407,11 +405,11 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // Calculate the correctly rounded reference result
     cl_int *r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
     float *s = (float *)p;
-    for (j = 0; j < buffer_elements; j++) r[j] = ref_func(s[j]);
+    for (size_t j = 0; j < buffer_elements; j++) r[j] = ref_func(s[j]);
 
     // Read the data back -- no need to wait for the first N-1 buffers but wait
     // for the last buffer. This is an in order queue.
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_int *)clEnqueueMapBuffer(
@@ -427,9 +425,9 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Verify data
     cl_int *t = (cl_int *)r;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
     {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_int *q = out[0];
 
@@ -456,7 +454,8 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
             }
 
 
-            for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
+            for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex;
+                 k++)
             {
                 q = out[k];
                 // If we aren't getting the correctly rounded result
@@ -486,7 +485,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
 exit:
     ret = error;
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
diff --git a/test_conformance/math_brute_force/mad_double.cpp b/test_conformance/math_brute_force/mad_double.cpp
index aa24507329..a32cd5a853 100644
--- a/test_conformance/math_brute_force/mad_double.cpp
+++ b/test_conformance/math_brute_force/mad_double.cpp
@@ -132,8 +132,6 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 
 int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
@@ -155,13 +153,13 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
             return error;
     }
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
         double *p3 = (double *)gIn3;
-        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
@@ -190,7 +188,7 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -205,7 +203,7 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeof(cl_double) * sizeValues[j];
             size_t localCount = (BUFFER_SIZE + vectorSize - 1)
@@ -252,11 +250,11 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
         double *s = (double *)gIn;
         double *s2 = (double *)gIn2;
         double *s3 = (double *)gIn3;
-        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(double); j++)
             r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]);
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
@@ -293,7 +291,7 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
diff --git a/test_conformance/math_brute_force/mad_float.cpp b/test_conformance/math_brute_force/mad_float.cpp
index 5d99eb0e93..095a22ff3b 100644
--- a/test_conformance/math_brute_force/mad_float.cpp
+++ b/test_conformance/math_brute_force/mad_float.cpp
@@ -130,8 +130,6 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 
 int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
@@ -154,13 +152,13 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
             return error;
     }
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         cl_uint *p = (cl_uint *)gIn;
         cl_uint *p2 = (cl_uint *)gIn2;
         cl_uint *p3 = (cl_uint *)gIn3;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
@@ -189,7 +187,7 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -204,7 +202,7 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeof(cl_float) * sizeValues[j];
             size_t localCount = (BUFFER_SIZE + vectorSize - 1)
@@ -251,11 +249,11 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
         float *s = (float *)gIn;
         float *s2 = (float *)gIn2;
         float *s3 = (float *)gIn3;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             r[j] = (float)f->func.f_fff(s[j], s2[j], s3[j]);
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
@@ -292,7 +290,7 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp
index 02d06fe315..606fdc5ad4 100644
--- a/test_conformance/math_brute_force/ternary_double.cpp
+++ b/test_conformance/math_brute_force/ternary_double.cpp
@@ -208,8 +208,6 @@ static const size_t specialValuesCount =
 int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
                                          bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
@@ -234,22 +232,23 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
             return error;
     }
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
         double *p3 = (double *)gIn3;
-        j = 0;
+        size_t idx = 0;
+
         if (i == 0)
         { // test edge cases
             uint32_t x, y, z;
             x = y = z = 0;
-            for (; j < BUFFER_SIZE / sizeof(double); j++)
+            for (; idx < BUFFER_SIZE / sizeof(double); idx++)
             {
-                p[j] = specialValues[x];
-                p2[j] = specialValues[y];
-                p3[j] = specialValues[z];
+                p[idx] = specialValues[x];
+                p2[idx] = specialValues[y];
+                p3[idx] = specialValues[z];
                 if (++x >= specialValuesCount)
                 {
                     x = 0;
@@ -260,15 +259,15 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
                     }
                 }
             }
-            if (j == BUFFER_SIZE / sizeof(double))
+            if (idx == BUFFER_SIZE / sizeof(double))
                 vlog_error("Test Error: not all special cases tested!\n");
         }
 
-        for (; j < BUFFER_SIZE / sizeof(double); j++)
+        for (; idx < BUFFER_SIZE / sizeof(double); idx++)
         {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-            p3[j] = DoubleFromUInt32(genrand_int32(d));
+            p[idx] = DoubleFromUInt32(genrand_int32(d));
+            p2[idx] = DoubleFromUInt32(genrand_int32(d));
+            p3[idx] = DoubleFromUInt32(genrand_int32(d));
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
@@ -293,7 +292,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -308,7 +307,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeof(cl_double) * sizeValues[j];
             size_t localCount = (BUFFER_SIZE + vectorSize - 1)
@@ -355,11 +354,11 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
         double *s = (double *)gIn;
         double *s2 = (double *)gIn2;
         double *s3 = (double *)gIn3;
-        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(double); j++)
             r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]);
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
@@ -374,9 +373,9 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
 
         // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
-        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint64_t *q = (uint64_t *)(gOut[k]);
 
@@ -731,7 +730,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
 
 exit:
     // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
diff --git a/test_conformance/math_brute_force/ternary_float.cpp b/test_conformance/math_brute_force/ternary_float.cpp
index 5ad564aae3..e52c0a0f41 100644
--- a/test_conformance/math_brute_force/ternary_float.cpp
+++ b/test_conformance/math_brute_force/ternary_float.cpp
@@ -215,8 +215,6 @@ static const size_t specialValuesCount =
 
 int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
@@ -250,13 +248,14 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             return error;
     }
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         cl_uint *p = (cl_uint *)gIn;
         cl_uint *p2 = (cl_uint *)gIn2;
         cl_uint *p3 = (cl_uint *)gIn3;
-        j = 0;
+        size_t idx = 0;
+
         if (i == 0)
         { // test edge cases
             float *fp = (float *)gIn;
@@ -264,11 +263,11 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             float *fp3 = (float *)gIn3;
             uint32_t x, y, z;
             x = y = z = 0;
-            for (; j < BUFFER_SIZE / sizeof(float); j++)
+            for (; idx < BUFFER_SIZE / sizeof(float); idx++)
             {
-                fp[j] = specialValues[x];
-                fp2[j] = specialValues[y];
-                fp3[j] = specialValues[z];
+                fp[idx] = specialValues[x];
+                fp2[idx] = specialValues[y];
+                fp3[idx] = specialValues[z];
 
                 if (++x >= specialValuesCount)
                 {
@@ -280,15 +279,15 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                     }
                 }
             }
-            if (j == BUFFER_SIZE / sizeof(float))
+            if (idx == BUFFER_SIZE / sizeof(float))
                 vlog_error("Test Error: not all special cases tested!\n");
         }
 
-        for (; j < BUFFER_SIZE / sizeof(float); j++)
+        for (; idx < BUFFER_SIZE / sizeof(float); idx++)
         {
-            p[j] = genrand_int32(d);
-            p2[j] = genrand_int32(d);
-            p3[j] = genrand_int32(d);
+            p[idx] = genrand_int32(d);
+            p2[idx] = genrand_int32(d);
+            p3[idx] = genrand_int32(d);
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
@@ -313,7 +312,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -328,7 +327,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeof(cl_float) * sizeValues[j];
             size_t localCount = (BUFFER_SIZE + vectorSize - 1)
@@ -377,7 +376,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         float *s3 = (float *)gIn3;
         if (skipNanInf)
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             {
                 feclearexcept(FE_OVERFLOW);
                 r[j] =
@@ -388,13 +387,13 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         }
         else
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 r[j] =
                     (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
         }
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
@@ -409,9 +408,9 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
 
@@ -866,7 +865,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
index fe3edef7a4..f6fa326447 100644
--- a/test_conformance/math_brute_force/unary_double.cpp
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -159,7 +159,6 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfo test_info;
     cl_int error;
-    size_t i, j;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
 
@@ -189,7 +188,7 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
         test_info.k[i] = (cl_kernel *)malloc(array_size);
@@ -212,7 +211,7 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     }
     memset(test_info.tinfo, 0,
            test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
             i * test_info.subBufferSize * sizeof(cl_double),
@@ -229,7 +228,7 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
         }
 
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
@@ -269,7 +268,7 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             if (test_info.tinfo[i].maxError > maxError)
             {
@@ -292,12 +291,12 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
         if (test_info.k[i])
         {
-            for (j = 0; j < test_info.threadCount; j++)
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
             free(test_info.k[i]);
@@ -305,10 +304,10 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     }
     if (test_info.tinfo)
     {
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
@@ -329,7 +328,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     ThreadInfo *tinfo = job->tinfo + thread_id;
     float ulps = job->ulps;
     dptr func = job->f->dfunc;
-    cl_uint j, k;
     cl_int error;
     int ftz = job->ftz;
 
@@ -338,7 +336,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_ulong *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         out[j] = (cl_ulong *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
@@ -356,7 +354,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Write the new values to the input array
     cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
         p[j] = DoubleFromUInt32(base + j * scale);
 
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
@@ -366,7 +364,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         return error;
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         // Wait for the map to finish
         if ((error = clWaitForEvents(1, e + j)))
@@ -428,11 +426,12 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // Calculate the correctly rounded reference result
     cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
     cl_double *s = (cl_double *)p;
-    for (j = 0; j < buffer_elements; j++) r[j] = (cl_double)func.f_f(s[j]);
+    for (size_t j = 0; j < buffer_elements; j++)
+        r[j] = (cl_double)func.f_f(s[j]);
 
     // Read the data back -- no need to wait for the first N-1 buffers but wait
     // for the last buffer. This is an in order queue.
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_ulong *)clEnqueueMapBuffer(
@@ -448,9 +447,9 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Verify data
     cl_ulong *t = (cl_ulong *)r;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
     {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_ulong *q = out[k];
 
@@ -516,7 +515,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
diff --git a/test_conformance/math_brute_force/unary_float.cpp b/test_conformance/math_brute_force/unary_float.cpp
index a0e45a2a31..17edc58d9d 100644
--- a/test_conformance/math_brute_force/unary_float.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -157,7 +157,6 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfo test_info;
     cl_int error;
-    size_t i, j;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
     int skipTestingRelaxed = (relaxedMode && strcmp(f->name, "tan") == 0);
@@ -189,7 +188,7 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     test_info.relaxedMode = relaxedMode;
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         size_t array_size = test_info.threadCount * sizeof(cl_kernel);
         test_info.k[i] = (cl_kernel *)malloc(array_size);
@@ -212,7 +211,7 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     }
     memset(test_info.tinfo, 0,
            test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
             i * test_info.subBufferSize * sizeof(cl_float),
@@ -229,7 +228,7 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
         }
 
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
@@ -287,7 +286,7 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             if (test_info.tinfo[i].maxError > maxError)
             {
@@ -316,12 +315,12 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
         if (test_info.k[i])
         {
-            for (j = 0; j < test_info.threadCount; j++)
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
             free(test_info.k[i]);
@@ -329,10 +328,10 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     }
     if (test_info.tinfo)
     {
-        for (i = 0; i < test_info.threadCount; i++)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
         {
             clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
@@ -360,7 +359,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         func = job->f->rfunc;
     }
 
-    cl_uint j, k;
     cl_int error;
 
     int isRangeLimited = job->isRangeLimited;
@@ -370,7 +368,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_uint *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
@@ -388,7 +386,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Write the new values to the input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
     {
         p[j] = base + j * scale;
         if (relaxedMode)
@@ -421,7 +419,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         return error;
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         // Wait for the map to finish
         if ((error = clWaitForEvents(1, e + j)))
@@ -482,11 +480,11 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // Calculate the correctly rounded reference result
     float *r = (float *)gOut_Ref + thread_id * buffer_elements;
     float *s = (float *)p;
-    for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_f(s[j]);
+    for (size_t j = 0; j < buffer_elements; j++) r[j] = (float)func.f_f(s[j]);
 
     // Read the data back -- no need to wait for the first N-1 buffers but wait
     // for the last buffer. This is an in order queue.
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_uint *)clEnqueueMapBuffer(
@@ -502,9 +500,9 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Verify data
     uint32_t *t = (uint32_t *)r;
-    for (j = 0; j < buffer_elements; j++)
+    for (size_t j = 0; j < buffer_elements; j++)
     {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             uint32_t *q = out[k];
 
@@ -695,7 +693,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
diff --git a/test_conformance/math_brute_force/unary_two_results_double.cpp b/test_conformance/math_brute_force/unary_two_results_double.cpp
index 6f3a080bad..71dd4f4431 100644
--- a/test_conformance/math_brute_force/unary_two_results_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_double.cpp
@@ -126,8 +126,6 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 
 int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
@@ -154,18 +152,18 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
             return error;
     }
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         double *p = (double *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
         }
         else
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
@@ -176,7 +174,7 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -201,7 +199,7 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
@@ -240,7 +238,7 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
         double *r = (double *)gOut_Ref;
         double *r2 = (double *)gOut_Ref2;
         double *s = (double *)gIn;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
         {
             long double dd;
             r[j] = (double)f->dfunc.f_fpf(s[j], &dd);
@@ -248,7 +246,7 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
@@ -271,9 +269,9 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
         // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
         uint64_t *t2 = (uint64_t *)gOut_Ref2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint64_t *q = (uint64_t *)(gOut[k]);
                 uint64_t *q2 = (uint64_t *)(gOut2[k]);
@@ -438,7 +436,7 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
diff --git a/test_conformance/math_brute_force/unary_two_results_float.cpp b/test_conformance/math_brute_force/unary_two_results_float.cpp
index f3cf032d68..639a920551 100644
--- a/test_conformance/math_brute_force/unary_two_results_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_float.cpp
@@ -124,8 +124,6 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 
 int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     uint32_t l;
     int error;
     char const *testing_mode;
@@ -155,13 +153,13 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
             return error;
     }
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         uint32_t *p = (uint32_t *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             {
                 p[j] = (uint32_t)i + j * scale;
                 if (relaxedMode && strcmp(f->name, "sincos") == 0)
@@ -173,7 +171,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         }
         else
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             {
                 p[j] = (uint32_t)i + j;
                 if (relaxedMode && strcmp(f->name, "sincos") == 0)
@@ -192,7 +190,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -217,7 +215,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
@@ -272,7 +270,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
 
         if (skipNanInf)
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             {
                 double dd;
                 feclearexcept(FE_OVERFLOW);
@@ -289,7 +287,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         }
         else
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             {
                 double dd;
                 if (relaxedMode)
@@ -304,7 +302,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         if (isFract && ftz) RestoreFPState(&oldMode);
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
@@ -331,9 +329,9 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
         uint32_t *t2 = (uint32_t *)gOut_Ref2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)gOut[k];
                 uint32_t *q2 = (uint32_t *)gOut2[k];
@@ -572,7 +570,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
diff --git a/test_conformance/math_brute_force/unary_two_results_i_double.cpp b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
index df1a5aa8d1..251ffb0be1 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
@@ -133,8 +133,6 @@ static cl_ulong abs_cl_long(cl_long i)
 
 int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
@@ -162,18 +160,18 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
             return error;
     }
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         double *p = (double *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
         }
         else
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
@@ -184,7 +182,7 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -209,7 +207,7 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
@@ -248,11 +246,11 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
         double *r = (double *)gOut_Ref;
         int *r2 = (int *)gOut_Ref2;
         double *s = (double *)gIn;
-        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(double); j++)
             r[j] = (double)f->dfunc.f_fpI(s[j], r2 + j);
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
@@ -275,9 +273,9 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
         // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint64_t *q = (uint64_t *)(gOut[k]);
                 int32_t *q2 = (int32_t *)(gOut2[k]);
@@ -409,7 +407,7 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
diff --git a/test_conformance/math_brute_force/unary_two_results_i_float.cpp b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
index c95ee061e4..030de58adc 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
@@ -131,8 +131,6 @@ static cl_ulong abs_cl_long(cl_long i)
 
 int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
@@ -165,18 +163,18 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
             return error;
     }
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         uint32_t *p = (uint32_t *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = (uint32_t)i + j * scale;
         }
         else
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = (uint32_t)i + j;
         }
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
@@ -187,7 +185,7 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -212,7 +210,7 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
@@ -251,11 +249,11 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
         float *r = (float *)gOut_Ref;
         int *r2 = (int *)gOut_Ref2;
         float *s = (float *)gIn;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             r[j] = (float)f->func.f_fpI(s[j], r2 + j);
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
@@ -278,9 +276,9 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
                 int32_t *q2 = (int32_t *)(gOut2[k]);
@@ -407,7 +405,7 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
diff --git a/test_conformance/math_brute_force/unary_u_double.cpp b/test_conformance/math_brute_force/unary_u_double.cpp
index 5f252614cc..940b0f884d 100644
--- a/test_conformance/math_brute_force/unary_u_double.cpp
+++ b/test_conformance/math_brute_force/unary_u_double.cpp
@@ -126,8 +126,6 @@ static cl_ulong random64(MTdata d)
 
 int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
@@ -150,11 +148,12 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
             return error;
     }
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         cl_ulong *p = (cl_ulong *)gIn;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_ulong); j++) p[j] = random64(d);
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(cl_ulong); j++)
+            p[j] = random64(d);
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           BUFFER_SIZE, gIn, 0, NULL, NULL)))
@@ -164,7 +163,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -179,7 +178,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
@@ -211,11 +210,11 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
         // Calculate the correctly rounded reference result
         double *r = (double *)gOut_Ref;
         cl_ulong *s = (cl_ulong *)gIn;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
             r[j] = (double)f->dfunc.f_u(s[j]);
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
@@ -230,9 +229,9 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
 
         // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint64_t *q = (uint64_t *)(gOut[k]);
 
@@ -306,7 +305,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
diff --git a/test_conformance/math_brute_force/unary_u_float.cpp b/test_conformance/math_brute_force/unary_u_float.cpp
index 18534dc57b..5c8f6ae6e6 100644
--- a/test_conformance/math_brute_force/unary_u_float.cpp
+++ b/test_conformance/math_brute_force/unary_u_float.cpp
@@ -118,8 +118,6 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 
 int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
@@ -165,18 +163,18 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
     }
 
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         uint32_t *p = (uint32_t *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = (uint32_t)i + j * scale;
         }
         else
         {
-            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = (uint32_t)i + j;
         }
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
@@ -187,7 +185,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -202,7 +200,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
@@ -234,11 +232,11 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
         // Calculate the correctly rounded reference result
         float *r = (float *)gOut_Ref;
         cl_uint *s = (cl_uint *)gIn;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             r[j] = (float)f->func.f_u(s[j]);
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
@@ -254,9 +252,9 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
 
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
 
@@ -339,7 +337,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
 
 exit:
     // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);

From 5daca107addf18872c4cd25403d1cd5f52dc0315 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Fri, 7 May 2021 09:28:38 +0100
Subject: [PATCH 080/158] Remove dead variables and functions (#1238)

These were identified using Clang's -Wunused warning flag.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/binary_float.cpp         |  1 -
 .../binary_operator_double.cpp                |  1 -
 .../math_brute_force/reference_math.cpp       | 25 -------------------
 .../unary_two_results_float.cpp               |  2 --
 4 files changed, 29 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_float.cpp b/test_conformance/math_brute_force/binary_float.cpp
index 80ddba6fd3..32caafa353 100644
--- a/test_conformance/math_brute_force/binary_float.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -279,7 +279,6 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
     double maxErrorVal2 = 0.0;
-    int skipTestingRelaxed = 0;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index 5481f12773..21e76c8553 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -469,7 +469,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     float ulps = job->ulps;
     dptr func = job->f->dfunc;
     int ftz = job->ftz;
-    bool relaxedMode = job->relaxedMode;
     MTdata d = tinfo->d;
     cl_int error;
     const char *name = job->f->name;
diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp
index cca134d4c4..3a6516bae8 100644
--- a/test_conformance/math_brute_force/reference_math.cpp
+++ b/test_conformance/math_brute_force/reference_math.cpp
@@ -382,8 +382,6 @@ static float round_toward_zero_float(cl_ulong p, int exponent)
 
 static float round_toward_zero_float_ftz(cl_ulong p, int exponent)
 {
-    extern int gCheckTininessBeforeRounding;
-
     union {
         cl_uint u;
         cl_float d;
@@ -2682,20 +2680,6 @@ static inline void mul128(cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo)
     *lo = (aloblo & 0xffffffffULL) | (alobhi << 32);
 }
 
-// Move the most significant non-zero bit to the MSB
-// Note: not general. Only works if the most significant non-zero bit is at
-// MSB-1
-static inline void renormalize(cl_ulong *hi, cl_ulong *lo, int *exponent)
-{
-    if (0 == (0x8000000000000000ULL & *hi))
-    {
-        *hi <<= 1;
-        *hi |= *lo >> 63;
-        *lo <<= 1;
-        *exponent -= 1;
-    }
-}
-
 static double round_to_nearest_even_double(cl_ulong hi, cl_ulong lo,
                                            int exponent)
 {
@@ -2990,8 +2974,6 @@ long double reference_recipl(long double x) { return 1.0L / x; }
 
 long double reference_rootnl(long double x, int i)
 {
-    double hi, lo;
-    long double l;
     // rootn ( x, 0 )  returns a NaN.
     if (0 == i) return cl_make_nan();
 
@@ -3290,7 +3272,6 @@ long double reference_cbrtl(long double x)
 
     if (isnan(x) || fabsx == 1.0 || fabsx == 0.0 || isinf(x)) return x;
 
-    double iy = 0.0;
     double log2x_hi, log2x_lo;
 
     // extended precision log .... accurate to at least 64-bits + couple of
@@ -3604,12 +3585,6 @@ long double reference_expm1l(long double x)
     // unimplemented
     return x;
 #else
-    union {
-        double f;
-        cl_ulong u;
-    } u;
-    u.f = (double)x;
-
     if (reference_isnanl(x)) return x;
 
     if (x > 710) return INFINITY;
diff --git a/test_conformance/math_brute_force/unary_two_results_float.cpp b/test_conformance/math_brute_force/unary_two_results_float.cpp
index 639a920551..4a375ce3fd 100644
--- a/test_conformance/math_brute_force/unary_two_results_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_float.cpp
@@ -124,9 +124,7 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 
 int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint32_t l;
     int error;
-    char const *testing_mode;
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError0 = 0.0f;

From 3dab3df48d7dbc22accf6c37c59e54e35a35de7f Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Fri, 7 May 2021 09:29:28 +0100
Subject: [PATCH 081/158] Report inputs for worst errors (#1231)

Builtin functions producing two results can have their worst error, for
each result, on different inputs. Report both inputs.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/binary_two_results_i_double.cpp           | 3 ++-
 .../math_brute_force/binary_two_results_i_float.cpp            | 3 ++-
 .../math_brute_force/unary_two_results_i_double.cpp            | 3 ++-
 .../math_brute_force/unary_two_results_i_float.cpp             | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_two_results_i_double.cpp b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
index 50250f9db4..14f4109239 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
@@ -562,7 +562,8 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         else
             vlog("passed");
 
-        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+        vlog("\t{%8.2f, %lld} @ {%a, %a}", maxError, maxError2, maxErrorVal,
+             maxErrorVal2);
     }
 
     vlog("\n");
diff --git a/test_conformance/math_brute_force/binary_two_results_i_float.cpp b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
index dfdd4a2ec4..5ef44b6e0c 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
@@ -547,7 +547,8 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         else
             vlog("passed");
 
-        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+        vlog("\t{%8.2f, %lld} @ {%a, %a}", maxError, maxError2, maxErrorVal,
+             maxErrorVal2);
     }
 
     vlog("\n");
diff --git a/test_conformance/math_brute_force/unary_two_results_i_double.cpp b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
index 251ffb0be1..14d1fb99f9 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
@@ -400,7 +400,8 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
         else
             vlog("passed");
 
-        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+        vlog("\t{%8.2f, %lld} @ {%a, %a}", maxError, maxError2, maxErrorVal,
+             maxErrorVal2);
     }
 
     vlog("\n");
diff --git a/test_conformance/math_brute_force/unary_two_results_i_float.cpp b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
index 030de58adc..23b0d7076b 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
@@ -398,7 +398,8 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
         else
             vlog("passed");
 
-        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+        vlog("\t{%8.2f, %lld} @ {%a, %a}", maxError, maxError2, maxErrorVal,
+             maxErrorVal2);
     }
 
     vlog("\n");

From 3dd6d4137dee1fc275e014e3efb3ed9086d5c79e Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Tue, 11 May 2021 18:06:16 +0100
Subject: [PATCH 082/158] Avoid manual memory management, fixes #975 (#1240)

Fix heap-buffer-overflow reported by AddressSanitizer: ensure the
appropriate number of elements are allocated for the list of tests.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/main.cpp | 32 +++++++++-------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 7b06ca87aa..d6c2f11ffb 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -22,6 +22,7 @@
 #include <cstdlib>
 #include <ctime>
 #include <string>
+#include <vector>
 
 #include "harness/errorHelpers.h"
 #include "harness/kernelHelpers.h"
@@ -51,8 +52,7 @@
     (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO                  \
      | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM)
 
-static const char **gTestNames = NULL;
-static unsigned int gTestNameCount = 0;
+static std::vector<const char *> gTestNames;
 static char appName[MAXPATHLEN] = "";
 cl_device_id gDevice = NULL;
 cl_context gContext = NULL;
@@ -331,13 +331,12 @@ int main(int argc, const char *argv[])
     FPU_mode_type oldMode;
     DisableFTZ(&oldMode);
 
-    int ret = runTestHarnessWithCheck(gTestNameCount, gTestNames, test_num,
-                                      test_list, true, 0, InitCL);
+    int ret = runTestHarnessWithCheck(gTestNames.size(), gTestNames.data(),
+                                      test_num, test_list, true, 0, InitCL);
 
     RestoreFPState(&oldMode);
 
     free_mtdata(gMTdata);
-    free(gTestNames);
 
     if (gQueue)
     {
@@ -352,15 +351,12 @@ int main(int argc, const char *argv[])
 
 static int ParseArgs(int argc, const char **argv)
 {
-    int i;
-    gTestNames = (const char **)calloc(argc - 1, sizeof(char *));
-    if (NULL == gTestNames)
-    {
-        vlog("Failed to allocate memory for gTestNames array.\n");
-        return 1;
-    }
-    gTestNames[0] = argv[0];
-    gTestNameCount = 1;
+    // We only pass test names to runTestHarnessWithCheck, hence global command
+    // line options defined by the harness cannot be used by the user.
+    // To respect the implementation details of runTestHarnessWithCheck,
+    // gTestNames[0] has to exist although its value is not important.
+    gTestNames.push_back("");
+
     int singleThreaded = 0;
 
     { // Extract the app name
@@ -380,7 +376,7 @@ static int ParseArgs(int argc, const char **argv)
     }
 
     vlog("\n%s\t", appName);
-    for (i = 1; i < argc; i++)
+    for (int i = 1; i < argc; i++)
     {
         const char *arg = argv[i];
         if (NULL == arg) break;
@@ -485,16 +481,14 @@ static int ParseArgs(int argc, const char **argv)
                     const Func *f = functionList + k;
                     if (strcmp(arg, f->name) == 0)
                     {
-                        gTestNames[gTestNameCount] = arg;
-                        gTestNameCount++;
+                        gTestNames.push_back(arg);
                         break;
                     }
                 }
                 // If we didn't find it in the list of test names
                 if (k >= functionListCount)
                 {
-                    gTestNames[gTestNameCount] = arg;
-                    gTestNameCount++;
+                    gTestNames.push_back(arg);
                 }
             }
         }

From 59a12047a8e4c98002705374596f99921a184ded Mon Sep 17 00:00:00 2001
From: sravikumar3393 <60196293+sravikumar3393@users.noreply.github.com>
Date: Tue, 11 May 2021 10:17:48 -0700
Subject: [PATCH 083/158] Fix for test_conversions failure with Clang build on
 Linux #1057 (#1062)

* Avoid optimization by using volatile qualifier
    * Fix both uint2float and ulong2double
---
 test_conformance/conversions/basic_test_conversions.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test_conformance/conversions/basic_test_conversions.cpp b/test_conformance/conversions/basic_test_conversions.cpp
index 44417262f0..329988414d 100644
--- a/test_conformance/conversions/basic_test_conversions.cpp
+++ b/test_conformance/conversions/basic_test_conversions.cpp
@@ -678,7 +678,8 @@ static void uint2short( void *out, void *in){ ((short*) out)[0] = ((cl_uint*) in
 static void uint2int( void *out, void *in){ ((cl_int*) out)[0] = ((cl_uint*) in)[0]; }
 static void uint2float( void *out, void *in)
 {
-    cl_uint l = ((cl_uint*) in)[0];
+    // Use volatile to prevent optimization by Clang compiler
+    volatile cl_uint l = ((cl_uint *)in)[0];
     ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
 }
 static void uint2double( void *out, void *in)
@@ -791,7 +792,8 @@ static void ulong2double( void *out, void *in)
 #endif
     ((double*) out)[0] = (l == 0 ? 0.0 : (((cl_long)l < 0) ? result * 2.0 : result));
 #else
-    cl_ulong l = ((cl_ulong*) in)[0];
+    // Use volatile to prevent optimization by Clang compiler
+    volatile cl_ulong l = ((cl_ulong *)in)[0];
     ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
 #endif
 }

From 06f7661fdcd99185361f12242ab4ec68d0bff83f Mon Sep 17 00:00:00 2001
From: Pierre Moreau <pierremoreau@users.noreply.github.com>
Date: Wed, 12 May 2021 12:39:17 +0200
Subject: [PATCH 084/158] basic/async: Check for extension only once per test
 (#1242)

As the extension is vector size and type independent, we only need to
check for the extension once per test and not for every possible
combination of inputs, thereby drastically reducing the log output of
the test when the extension is not supported.
---
 test_conformance/basic/test_async_copy2D.cpp     | 14 +++++++-------
 test_conformance/basic/test_async_copy3D.cpp     | 14 +++++++-------
 test_conformance/basic/test_async_copy_fence.cpp | 16 ++++++++--------
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/test_conformance/basic/test_async_copy2D.cpp b/test_conformance/basic/test_async_copy2D.cpp
index 2b534497fa..9fbdcb6e39 100644
--- a/test_conformance/basic/test_async_copy2D.cpp
+++ b/test_conformance/basic/test_async_copy2D.cpp
@@ -117,13 +117,6 @@ int test_copy2D(cl_device_id deviceID, cl_context context,
     log_info("Testing %s with srcStride = %d, dstStride = %d\n", vecNameString,
              srcStride, dstStride);
 
-    if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
-    {
-        log_info(
-            "Device does not support extended async copies. Skipping test.\n");
-        return 0;
-    }
-
     cl_long max_local_mem_size;
     error =
         clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
@@ -386,6 +379,13 @@ int test_copy2D_all_types(cl_device_id deviceID, cl_context context,
 
     int errors = 0;
 
+    if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
+    {
+        log_info(
+            "Device does not support extended async copies. Skipping test.\n");
+        return 0;
+    }
+
     for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
     {
         if (vecType[typeIndex] == kDouble
diff --git a/test_conformance/basic/test_async_copy3D.cpp b/test_conformance/basic/test_async_copy3D.cpp
index af10191fee..252159bc2b 100644
--- a/test_conformance/basic/test_async_copy3D.cpp
+++ b/test_conformance/basic/test_async_copy3D.cpp
@@ -138,13 +138,6 @@ int test_copy3D(cl_device_id deviceID, cl_context context,
              vecNameString, srcLineStride, dstLineStride, srcPlaneStride,
              dstPlaneStride);
 
-    if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
-    {
-        log_info(
-            "Device does not support extended async copies. Skipping test.\n");
-        return 0;
-    }
-
     cl_long max_local_mem_size;
     error =
         clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
@@ -467,6 +460,13 @@ int test_copy3D_all_types(cl_device_id deviceID, cl_context context,
 
     int errors = 0;
 
+    if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
+    {
+        log_info(
+            "Device does not support extended async copies. Skipping test.\n");
+        return 0;
+    }
+
     for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
     {
         if (vecType[typeIndex] == kDouble
diff --git a/test_conformance/basic/test_async_copy_fence.cpp b/test_conformance/basic/test_async_copy_fence.cpp
index 74f6e40715..43245dae1c 100644
--- a/test_conformance/basic/test_async_copy_fence.cpp
+++ b/test_conformance/basic/test_async_copy_fence.cpp
@@ -360,14 +360,6 @@ int test_copy_fence(cl_device_id deviceID, cl_context context,
     size_t elementSize = get_explicit_type_size(vecType) * vecSize;
     log_info("Testing %s\n", vecNameString);
 
-    if (!is_extension_available(deviceID, "cl_khr_async_work_group_copy_fence"))
-    {
-        log_info(
-            "Device does not support extended async copies fence. Skipping "
-            "test.\n");
-        return 0;
-    }
-
     cl_long max_local_mem_size;
     error =
         clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
@@ -732,6 +724,14 @@ int test_copy_fence_all_types(cl_device_id deviceID, cl_context context,
 
     int errors = 0;
 
+    if (!is_extension_available(deviceID, "cl_khr_async_work_group_copy_fence"))
+    {
+        log_info(
+            "Device does not support extended async copies fence. Skipping "
+            "test.\n");
+        return 0;
+    }
+
     for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
     {
         if (vecType[typeIndex] == kDouble

From d7f87492bda474bda9b8bc3a722b2740664a0930 Mon Sep 17 00:00:00 2001
From: Pierre Moreau <pierremoreau@users.noreply.github.com>
Date: Wed, 12 May 2021 12:39:45 +0200
Subject: [PATCH 085/158] testHarness: Print error string when clFinish fails
 (#1243)

---
 test_common/harness/testHarness.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_common/harness/testHarness.cpp b/test_common/harness/testHarness.cpp
index 5d96c43f56..1aec3d0789 100644
--- a/test_common/harness/testHarness.cpp
+++ b/test_common/harness/testHarness.cpp
@@ -861,7 +861,7 @@ test_status callSingleTestFunction(test_definition test,
         int error = clFinish(queue);
         if (error)
         {
-            log_error("clFinish failed: %d", error);
+            log_error("clFinish failed: %s\n", IGetErrorString(error));
             status = TEST_FAIL;
         }
         clReleaseCommandQueue(queue);

From ad8ab3fe90467c477ffd2299f5f887d5d312df46 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=A9vin=20Petit?= <kpet@free.fr>
Date: Thu, 13 May 2021 09:13:03 +0100
Subject: [PATCH 086/158] Remove OpenCL C++ tests (#1241)

* Remove OpenCL C++ tests

Agreed in the 2021/05/11 teleconference.

Signed-off-by: Kevin Petit <kevin.petit@arm.com>

* fix CI
---
 CMakeLists.txt                                |  53 +-
 presubmit.sh                                  |   6 +-
 test_common/harness/kernelHelpers.cpp         | 219 +-----
 test_common/harness/kernelHelpers.h           |  11 +-
 test_conformance/CMakeLists.txt               |   3 -
 test_conformance/clcpp/CMakeLists.txt         |  21 -
 .../clcpp/address_spaces/CMakeLists.txt       |   7 -
 .../clcpp/address_spaces/common.hpp           | 202 -----
 .../clcpp/address_spaces/main.cpp             |  25 -
 .../address_spaces/test_pointer_types.hpp     | 412 ----------
 .../address_spaces/test_storage_types.hpp     | 418 ----------
 test_conformance/clcpp/api/CMakeLists.txt     |   7 -
 test_conformance/clcpp/api/main.cpp           |  27 -
 test_conformance/clcpp/api/test_ctors.hpp     | 487 ------------
 .../clcpp/api/test_ctors_dtors.hpp            | 185 -----
 test_conformance/clcpp/api/test_dtors.hpp     | 559 -------------
 .../clcpp/api/test_spec_consts.hpp            | 480 ------------
 test_conformance/clcpp/atomics/CMakeLists.txt |   7 -
 .../clcpp/atomics/atomic_fetch.hpp            | 308 --------
 test_conformance/clcpp/atomics/main.cpp       |  25 -
 .../clcpp/attributes/CMakeLists.txt           |   7 -
 test_conformance/clcpp/attributes/main.cpp    |  27 -
 .../clcpp/attributes/test_ivdep.hpp           | 418 ----------
 .../clcpp/attributes/test_max_size.hpp        | 266 -------
 .../test_required_num_sub_groups.hpp          | 285 -------
 test_conformance/clcpp/common.hpp             |  51 --
 .../clcpp/common_funcs/CMakeLists.txt         |   7 -
 .../clcpp/common_funcs/common_funcs.hpp       | 417 ----------
 test_conformance/clcpp/common_funcs/main.cpp  |  43 -
 test_conformance/clcpp/convert/CMakeLists.txt |   7 -
 .../clcpp/convert/convert_cast.hpp            | 309 --------
 test_conformance/clcpp/convert/main.cpp       |  25 -
 .../clcpp/device_queue/CMakeLists.txt         |   7 -
 test_conformance/clcpp/device_queue/main.cpp  |  25 -
 .../clcpp/device_queue/test_enqueue.hpp       | 699 -----------------
 test_conformance/clcpp/funcs_test_utils.hpp   |  72 --
 .../clcpp/geometric_funcs/CMakeLists.txt      |   7 -
 .../geometric_funcs/fast_geometric_funcs.hpp  | 229 ------
 .../clcpp/geometric_funcs/geometric_funcs.hpp | 389 ----------
 .../clcpp/geometric_funcs/main.cpp            |  44 --
 test_conformance/clcpp/images/CMakeLists.txt  |   7 -
 test_conformance/clcpp/images/common.hpp      | 195 -----
 test_conformance/clcpp/images/main.cpp        |  30 -
 test_conformance/clcpp/images/test_read.hpp   | 307 --------
 test_conformance/clcpp/images/test_sample.hpp | 363 ---------
 test_conformance/clcpp/images/test_write.hpp  | 327 --------
 .../clcpp/integer_funcs/24bit_funcs.hpp       | 142 ----
 .../clcpp/integer_funcs/CMakeLists.txt        |   7 -
 .../clcpp/integer_funcs/bitwise_funcs.hpp     | 232 ------
 .../clcpp/integer_funcs/common.hpp            |  26 -
 test_conformance/clcpp/integer_funcs/main.cpp |  26 -
 .../clcpp/integer_funcs/numeric_funcs.hpp     | 703 -----------------
 .../clcpp/math_funcs/CMakeLists.txt           |   7 -
 test_conformance/clcpp/math_funcs/common.hpp  | 347 ---------
 .../clcpp/math_funcs/comparison_funcs.hpp     |  59 --
 .../clcpp/math_funcs/exponential_funcs.hpp    | 139 ----
 .../clcpp/math_funcs/floating_point_funcs.hpp | 733 ------------------
 .../clcpp/math_funcs/half_math_funcs.hpp      | 106 ---
 .../clcpp/math_funcs/logarithmic_funcs.hpp    | 261 -------
 test_conformance/clcpp/math_funcs/main.cpp    |  50 --
 .../clcpp/math_funcs/other_funcs.hpp          |  75 --
 .../clcpp/math_funcs/power_funcs.hpp          | 153 ----
 .../clcpp/math_funcs/reference.hpp            | 315 --------
 .../clcpp/math_funcs/trigonometric_funcs.hpp  | 222 ------
 test_conformance/clcpp/pipes/CMakeLists.txt   |   7 -
 test_conformance/clcpp/pipes/main.cpp         |  25 -
 test_conformance/clcpp/pipes/test_pipes.hpp   | 632 ---------------
 .../program_scope_ctors_dtors/CMakeLists.txt  |   7 -
 .../program_scope_ctors_dtors/common.hpp      | 283 -------
 .../clcpp/program_scope_ctors_dtors/main.cpp  |  24 -
 .../test_ctors_dtors.hpp                      | 324 --------
 .../clcpp/reinterpret/CMakeLists.txt          |   7 -
 .../clcpp/reinterpret/as_type.hpp             | 223 ------
 test_conformance/clcpp/reinterpret/main.cpp   |  25 -
 .../clcpp/relational_funcs/CMakeLists.txt     |   7 -
 .../clcpp/relational_funcs/common.hpp         | 112 ---
 .../relational_funcs/comparison_funcs.hpp     | 150 ----
 .../clcpp/relational_funcs/main.cpp           |  26 -
 .../clcpp/relational_funcs/select_funcs.hpp   | 158 ----
 .../clcpp/relational_funcs/test_funcs.hpp     | 336 --------
 .../clcpp/spec_constants/CMakeLists.txt       |   7 -
 .../clcpp/spec_constants/common.hpp           | 256 ------
 .../clcpp/spec_constants/main.cpp             |  26 -
 .../test_spec_consts_attributes.hpp           | 281 -------
 .../spec_constants/test_spec_consts_if.hpp    | 161 ----
 .../test_spec_consts_init_vars.hpp            | 174 -----
 test_conformance/clcpp/spirv10_2016.04.27.7z  |   3 -
 .../clcpp/subgroups/CMakeLists.txt            |   7 -
 test_conformance/clcpp/subgroups/common.hpp   |  97 ---
 test_conformance/clcpp/subgroups/main.cpp     |  29 -
 .../clcpp/subgroups/test_sg_all.hpp           | 221 ------
 .../clcpp/subgroups/test_sg_any.hpp           | 221 ------
 .../clcpp/subgroups/test_sg_broadcast.hpp     | 206 -----
 .../clcpp/subgroups/test_sg_reduce.hpp        | 348 ---------
 .../subgroups/test_sg_scan_exclusive.hpp      | 328 --------
 .../subgroups/test_sg_scan_inclusive.hpp      | 335 --------
 .../clcpp/synchronization/CMakeLists.txt      |   7 -
 .../clcpp/synchronization/main.cpp            |  27 -
 .../synchronization/named_barrier/common.hpp  | 172 ----
 .../named_barrier/test_named_barrier.hpp      | 491 ------------
 .../named_barrier/test_spec_example.hpp       | 325 --------
 .../test_sub_group_barrier.hpp                | 342 --------
 .../test_work_group_barrier.hpp               | 330 --------
 .../clcpp/utils_common/errors.hpp             | 134 ----
 .../clcpp/utils_common/is_vector_type.hpp     |  60 --
 .../clcpp/utils_common/kernel_helpers.hpp     |  50 --
 .../clcpp/utils_common/make_vector_type.hpp   |  65 --
 .../clcpp/utils_common/scalar_type.hpp        |  64 --
 .../clcpp/utils_common/string.hpp             |  70 --
 .../clcpp/utils_common/type_name.hpp          |  65 --
 .../clcpp/utils_common/type_supported.hpp     | 106 ---
 .../clcpp/utils_common/vector_size.hpp        |  61 --
 test_conformance/clcpp/utils_test/binary.hpp  | 305 --------
 test_conformance/clcpp/utils_test/compare.hpp | 161 ----
 .../utils_test/detail/base_func_type.hpp      | 112 ---
 .../clcpp/utils_test/detail/vec_helpers.hpp   | 104 ---
 .../clcpp/utils_test/generate_inputs.hpp      | 331 --------
 test_conformance/clcpp/utils_test/ternary.hpp | 364 ---------
 test_conformance/clcpp/utils_test/unary.hpp   | 259 -------
 .../clcpp/vload_vstore/CMakeLists.txt         |   7 -
 .../clcpp/vload_vstore/common.hpp             |  82 --
 .../clcpp/vload_vstore/half_utils.hpp         |  54 --
 test_conformance/clcpp/vload_vstore/main.cpp  |  25 -
 .../clcpp/vload_vstore/vload_funcs.hpp        | 367 ---------
 .../clcpp/vload_vstore/vstore_funcs.hpp       | 349 ---------
 .../clcpp/workgroups/CMakeLists.txt           |   7 -
 test_conformance/clcpp/workgroups/common.hpp  |  97 ---
 test_conformance/clcpp/workgroups/main.cpp    |  29 -
 .../clcpp/workgroups/test_wg_all.hpp          | 220 ------
 .../clcpp/workgroups/test_wg_any.hpp          | 220 ------
 .../clcpp/workgroups/test_wg_broadcast.hpp    | 460 -----------
 .../clcpp/workgroups/test_wg_reduce.hpp       | 334 --------
 .../workgroups/test_wg_scan_exclusive.hpp     | 327 --------
 .../workgroups/test_wg_scan_inclusive.hpp     | 327 --------
 .../clcpp/workitems/CMakeLists.txt            |   7 -
 test_conformance/clcpp/workitems/main.cpp     |  25 -
 .../clcpp/workitems/test_workitems.hpp        | 417 ----------
 .../cxx_for_opencl_ext.cpp                    |   3 +-
 138 files changed, 52 insertions(+), 23984 deletions(-)
 delete mode 100644 test_conformance/clcpp/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/address_spaces/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/address_spaces/common.hpp
 delete mode 100644 test_conformance/clcpp/address_spaces/main.cpp
 delete mode 100644 test_conformance/clcpp/address_spaces/test_pointer_types.hpp
 delete mode 100644 test_conformance/clcpp/address_spaces/test_storage_types.hpp
 delete mode 100644 test_conformance/clcpp/api/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/api/main.cpp
 delete mode 100644 test_conformance/clcpp/api/test_ctors.hpp
 delete mode 100644 test_conformance/clcpp/api/test_ctors_dtors.hpp
 delete mode 100644 test_conformance/clcpp/api/test_dtors.hpp
 delete mode 100644 test_conformance/clcpp/api/test_spec_consts.hpp
 delete mode 100644 test_conformance/clcpp/atomics/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/atomics/atomic_fetch.hpp
 delete mode 100644 test_conformance/clcpp/atomics/main.cpp
 delete mode 100644 test_conformance/clcpp/attributes/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/attributes/main.cpp
 delete mode 100644 test_conformance/clcpp/attributes/test_ivdep.hpp
 delete mode 100644 test_conformance/clcpp/attributes/test_max_size.hpp
 delete mode 100644 test_conformance/clcpp/attributes/test_required_num_sub_groups.hpp
 delete mode 100644 test_conformance/clcpp/common.hpp
 delete mode 100644 test_conformance/clcpp/common_funcs/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/common_funcs/common_funcs.hpp
 delete mode 100644 test_conformance/clcpp/common_funcs/main.cpp
 delete mode 100644 test_conformance/clcpp/convert/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/convert/convert_cast.hpp
 delete mode 100644 test_conformance/clcpp/convert/main.cpp
 delete mode 100644 test_conformance/clcpp/device_queue/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/device_queue/main.cpp
 delete mode 100644 test_conformance/clcpp/device_queue/test_enqueue.hpp
 delete mode 100644 test_conformance/clcpp/funcs_test_utils.hpp
 delete mode 100644 test_conformance/clcpp/geometric_funcs/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/geometric_funcs/fast_geometric_funcs.hpp
 delete mode 100644 test_conformance/clcpp/geometric_funcs/geometric_funcs.hpp
 delete mode 100644 test_conformance/clcpp/geometric_funcs/main.cpp
 delete mode 100644 test_conformance/clcpp/images/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/images/common.hpp
 delete mode 100644 test_conformance/clcpp/images/main.cpp
 delete mode 100644 test_conformance/clcpp/images/test_read.hpp
 delete mode 100644 test_conformance/clcpp/images/test_sample.hpp
 delete mode 100644 test_conformance/clcpp/images/test_write.hpp
 delete mode 100644 test_conformance/clcpp/integer_funcs/24bit_funcs.hpp
 delete mode 100644 test_conformance/clcpp/integer_funcs/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/integer_funcs/bitwise_funcs.hpp
 delete mode 100644 test_conformance/clcpp/integer_funcs/common.hpp
 delete mode 100644 test_conformance/clcpp/integer_funcs/main.cpp
 delete mode 100644 test_conformance/clcpp/integer_funcs/numeric_funcs.hpp
 delete mode 100644 test_conformance/clcpp/math_funcs/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/math_funcs/common.hpp
 delete mode 100644 test_conformance/clcpp/math_funcs/comparison_funcs.hpp
 delete mode 100644 test_conformance/clcpp/math_funcs/exponential_funcs.hpp
 delete mode 100644 test_conformance/clcpp/math_funcs/floating_point_funcs.hpp
 delete mode 100644 test_conformance/clcpp/math_funcs/half_math_funcs.hpp
 delete mode 100644 test_conformance/clcpp/math_funcs/logarithmic_funcs.hpp
 delete mode 100644 test_conformance/clcpp/math_funcs/main.cpp
 delete mode 100644 test_conformance/clcpp/math_funcs/other_funcs.hpp
 delete mode 100644 test_conformance/clcpp/math_funcs/power_funcs.hpp
 delete mode 100644 test_conformance/clcpp/math_funcs/reference.hpp
 delete mode 100644 test_conformance/clcpp/math_funcs/trigonometric_funcs.hpp
 delete mode 100644 test_conformance/clcpp/pipes/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/pipes/main.cpp
 delete mode 100644 test_conformance/clcpp/pipes/test_pipes.hpp
 delete mode 100644 test_conformance/clcpp/program_scope_ctors_dtors/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/program_scope_ctors_dtors/common.hpp
 delete mode 100644 test_conformance/clcpp/program_scope_ctors_dtors/main.cpp
 delete mode 100644 test_conformance/clcpp/program_scope_ctors_dtors/test_ctors_dtors.hpp
 delete mode 100644 test_conformance/clcpp/reinterpret/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/reinterpret/as_type.hpp
 delete mode 100644 test_conformance/clcpp/reinterpret/main.cpp
 delete mode 100644 test_conformance/clcpp/relational_funcs/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/relational_funcs/common.hpp
 delete mode 100644 test_conformance/clcpp/relational_funcs/comparison_funcs.hpp
 delete mode 100644 test_conformance/clcpp/relational_funcs/main.cpp
 delete mode 100644 test_conformance/clcpp/relational_funcs/select_funcs.hpp
 delete mode 100644 test_conformance/clcpp/relational_funcs/test_funcs.hpp
 delete mode 100644 test_conformance/clcpp/spec_constants/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/spec_constants/common.hpp
 delete mode 100644 test_conformance/clcpp/spec_constants/main.cpp
 delete mode 100644 test_conformance/clcpp/spec_constants/test_spec_consts_attributes.hpp
 delete mode 100644 test_conformance/clcpp/spec_constants/test_spec_consts_if.hpp
 delete mode 100644 test_conformance/clcpp/spec_constants/test_spec_consts_init_vars.hpp
 delete mode 100644 test_conformance/clcpp/spirv10_2016.04.27.7z
 delete mode 100644 test_conformance/clcpp/subgroups/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/subgroups/common.hpp
 delete mode 100644 test_conformance/clcpp/subgroups/main.cpp
 delete mode 100644 test_conformance/clcpp/subgroups/test_sg_all.hpp
 delete mode 100644 test_conformance/clcpp/subgroups/test_sg_any.hpp
 delete mode 100644 test_conformance/clcpp/subgroups/test_sg_broadcast.hpp
 delete mode 100644 test_conformance/clcpp/subgroups/test_sg_reduce.hpp
 delete mode 100644 test_conformance/clcpp/subgroups/test_sg_scan_exclusive.hpp
 delete mode 100644 test_conformance/clcpp/subgroups/test_sg_scan_inclusive.hpp
 delete mode 100644 test_conformance/clcpp/synchronization/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/synchronization/main.cpp
 delete mode 100644 test_conformance/clcpp/synchronization/named_barrier/common.hpp
 delete mode 100644 test_conformance/clcpp/synchronization/named_barrier/test_named_barrier.hpp
 delete mode 100644 test_conformance/clcpp/synchronization/named_barrier/test_spec_example.hpp
 delete mode 100644 test_conformance/clcpp/synchronization/test_sub_group_barrier.hpp
 delete mode 100644 test_conformance/clcpp/synchronization/test_work_group_barrier.hpp
 delete mode 100644 test_conformance/clcpp/utils_common/errors.hpp
 delete mode 100644 test_conformance/clcpp/utils_common/is_vector_type.hpp
 delete mode 100644 test_conformance/clcpp/utils_common/kernel_helpers.hpp
 delete mode 100644 test_conformance/clcpp/utils_common/make_vector_type.hpp
 delete mode 100644 test_conformance/clcpp/utils_common/scalar_type.hpp
 delete mode 100644 test_conformance/clcpp/utils_common/string.hpp
 delete mode 100644 test_conformance/clcpp/utils_common/type_name.hpp
 delete mode 100644 test_conformance/clcpp/utils_common/type_supported.hpp
 delete mode 100644 test_conformance/clcpp/utils_common/vector_size.hpp
 delete mode 100644 test_conformance/clcpp/utils_test/binary.hpp
 delete mode 100644 test_conformance/clcpp/utils_test/compare.hpp
 delete mode 100644 test_conformance/clcpp/utils_test/detail/base_func_type.hpp
 delete mode 100644 test_conformance/clcpp/utils_test/detail/vec_helpers.hpp
 delete mode 100644 test_conformance/clcpp/utils_test/generate_inputs.hpp
 delete mode 100644 test_conformance/clcpp/utils_test/ternary.hpp
 delete mode 100644 test_conformance/clcpp/utils_test/unary.hpp
 delete mode 100644 test_conformance/clcpp/vload_vstore/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/vload_vstore/common.hpp
 delete mode 100644 test_conformance/clcpp/vload_vstore/half_utils.hpp
 delete mode 100644 test_conformance/clcpp/vload_vstore/main.cpp
 delete mode 100644 test_conformance/clcpp/vload_vstore/vload_funcs.hpp
 delete mode 100644 test_conformance/clcpp/vload_vstore/vstore_funcs.hpp
 delete mode 100644 test_conformance/clcpp/workgroups/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/workgroups/common.hpp
 delete mode 100644 test_conformance/clcpp/workgroups/main.cpp
 delete mode 100644 test_conformance/clcpp/workgroups/test_wg_all.hpp
 delete mode 100644 test_conformance/clcpp/workgroups/test_wg_any.hpp
 delete mode 100644 test_conformance/clcpp/workgroups/test_wg_broadcast.hpp
 delete mode 100644 test_conformance/clcpp/workgroups/test_wg_reduce.hpp
 delete mode 100644 test_conformance/clcpp/workgroups/test_wg_scan_exclusive.hpp
 delete mode 100644 test_conformance/clcpp/workgroups/test_wg_scan_inclusive.hpp
 delete mode 100644 test_conformance/clcpp/workitems/CMakeLists.txt
 delete mode 100644 test_conformance/clcpp/workitems/main.cpp
 delete mode 100644 test_conformance/clcpp/workitems/test_workitems.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d32692c9d..083ea96d5f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,58 +62,13 @@ set(CONFORMANCE_SUFFIX "" )
 #build driver as a dependency of the conformance tests, or other such CMake customization
 include(CMakeVendor.txt OPTIONAL)
 
-#-----------------------------------------------------------
-# Development options for OpenCL C++ tests
-#-----------------------------------------------------------
-# Use OpenCL C kernels instead of OpenCL C++ kernels
-option(CLPP_DEVELOPMENT_USE_OPENCLC_KERNELS "Use OpenCL C kernels in OpenCL C++ tests" OFF)
-if(CLPP_DEVELOPMENT_USE_OPENCLC_KERNELS)
-  set(CLPP_DEVELOPMENT_OPTIONS ${CLPP_DEVELOPMENT_OPTIONS} -DCLPP_DEVELOPMENT_USE_OPENCLC_KERNELS)
-endif(CLPP_DEVELOPMENT_USE_OPENCLC_KERNELS)
-# Only check if OpenCL C++ kernels compile to SPIR-V
-option(CLPP_DEVELOPMENT_ONLY_SPIRV_COMPILATION "Only check if OpenCL C++ kernels compile to SPIR-V" OFF)
-if(CLPP_DEVELOPMENT_ONLY_SPIRV_COMPILATION)
-  if(CLPP_DEVELOPMENT_USE_OPENCLC_KERNELS)
-    message(FATAL_ERROR "Can't use OpenCL C kernels and compile to SPIR-V.")
-  endif(CLPP_DEVELOPMENT_USE_OPENCLC_KERNELS)
-  set(CLPP_DEVELOPMENT_OPTIONS ${CLPP_DEVELOPMENT_OPTIONS} -DCLPP_DEVELOPMENT_ONLY_SPIRV_COMPILATION)
-endif(CLPP_DEVELOPMENT_ONLY_SPIRV_COMPILATION)
-#
-if(CLPP_DEVELOPMENT_OPTIONS)
-  add_definitions(-DCLPP_DEVELOPMENT_OPTIONS)
-  add_definitions(${CLPP_DEVELOPMENT_OPTIONS})
-endif(CLPP_DEVELOPMENT_OPTIONS)
-
-# Offline OpenCL C/C++ compiler provided by Khronos is the only supported
-# offline compiler.
-#
-# Path to offline OpenCL C/C++ compiler provided by Khronos.
-# See https://github.com/KhronosGroup/SPIR/ (spirv-1.1 branch or newer SPIR-V-ready
-# branch should be used).
-if(KHRONOS_OFFLINE_COMPILER)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DKHRONOS_OFFLINE_COMPILER=${KHRONOS_OFFLINE_COMPILER}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DKHRONOS_OFFLINE_COMPILER=${KHRONOS_OFFLINE_COMPILER}")
-    # Additional OpenCL C/C++ compiler option.
-    if(KHRONOS_OFFLINE_COMPILER_OPTIONS)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DKHRONOS_OFFLINE_COMPILER_OPTIONS=${KHRONOS_OFFLINE_COMPILER_OPTIONS}")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DKHRONOS_OFFLINE_COMPILER_OPTIONS=${KHRONOS_OFFLINE_COMPILER_OPTIONS}")
-    endif(KHRONOS_OFFLINE_COMPILER_OPTIONS)
-else(KHRONOS_OFFLINE_COMPILER)
-    message(WARNING "KHRONOS_OFFLINE_COMPILER is not defined!")
-    message(WARNING "Running CL C++ tests will not be possible.")
-endif(KHRONOS_OFFLINE_COMPILER)
-
-# CL_LIBCLCXX_DIR - path to dir with OpenCL C++ STL (libclcxx)
 # CL_INCLUDE_DIR - path to dir with OpenCL headers
-# CL_LIBCLCXX_DIR - path to dir with OpenCL library
-if(CL_INCLUDE_DIR AND CL_LIB_DIR AND CL_LIBCLCXX_DIR)
+if(CL_INCLUDE_DIR AND CL_LIB_DIR)
     link_directories(${CL_LIB_DIR})
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DCL_LIBCLCXX_DIR=${CL_LIBCLCXX_DIR}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCL_LIBCLCXX_DIR=${CL_LIBCLCXX_DIR}")
-else(CL_INCLUDE_DIR AND CL_LIB_DIR AND CL_LIBCLCXX_DIR)
+else(CL_INCLUDE_DIR AND CL_LIB_DIR)
     message(STATUS "OpenCL hasn't been found!")
-    message(FATAL_ERROR "Either install OpenCL or pass -DCL_INCLUDE_DIR, -DCL_LIB_DIR and -DCL_LIBCLCXX_DIR")
-endif(CL_INCLUDE_DIR AND CL_LIB_DIR AND CL_LIBCLCXX_DIR)
+    message(FATAL_ERROR "Either install OpenCL or pass -DCL_INCLUDE_DIR and -DCL_LIB_DIR")
+endif(CL_INCLUDE_DIR AND CL_LIB_DIR)
 
 # CLConform_GL_LIBRARIES_DIR - path to OpenGL libraries
 if(GL_IS_SUPPORTED AND CLConform_GL_LIBRARIES_DIR)
diff --git a/presubmit.sh b/presubmit.sh
index b0aa934f1f..6fc037c8d5 100755
--- a/presubmit.sh
+++ b/presubmit.sh
@@ -55,17 +55,13 @@ cd build
 cmake -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE} -DOPENCL_ICD_LOADER_HEADERS_DIR=${TOP}/OpenCL-Headers/ ..
 make
 
-# Get libclcxx
-cd ${TOP}
-git clone https://github.com/KhronosGroup/libclcxx.git
-
 # Build CTS
+cd ${TOP}
 ls -l
 mkdir build
 cd build
 cmake -DCL_INCLUDE_DIR=${TOP}/OpenCL-Headers \
       -DCL_LIB_DIR=${TOP}/OpenCL-ICD-Loader/build \
-      -DCL_LIBCLCXX_DIR=${TOP}/libclcxx \
       -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE} \
       -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=./bin \
       -DOPENCL_LIBRARIES="-lOpenCL -lpthread" \
diff --git a/test_common/harness/kernelHelpers.cpp b/test_common/harness/kernelHelpers.cpp
index 6ccdcc6e6f..f2d2909dba 100644
--- a/test_common/harness/kernelHelpers.cpp
+++ b/test_common/harness/kernelHelpers.cpp
@@ -312,57 +312,6 @@ get_compilation_mode_str(const CompilationMode compilationMode)
     }
 }
 
-#ifdef KHRONOS_OFFLINE_COMPILER
-static std::string
-get_khronos_compiler_command(const cl_uint device_address_space_size,
-                             const bool openclCXX, const std::string &bOptions,
-                             const std::string &sourceFilename,
-                             const std::string &outputFilename)
-{
-    // Set compiler options
-    // Emit SPIR-V
-    std::string compilerOptions = " -cc1 -emit-spirv";
-    // <triple>: for 32 bit SPIR-V use spir-unknown-unknown, for 64 bit SPIR-V
-    // use spir64-unknown-unknown.
-    if (device_address_space_size == 32)
-    {
-        compilerOptions += " -triple=spir-unknown-unknown";
-    }
-    else
-    {
-        compilerOptions += " -triple=spir64-unknown-unknown";
-    }
-    // Set OpenCL C++ flag required by SPIR-V-ready clang (compiler provided by
-    // Khronos)
-    if (openclCXX)
-    {
-        compilerOptions = compilerOptions + " -cl-std=c++";
-    }
-    // Set correct includes
-    if (openclCXX)
-    {
-        compilerOptions += " -I ";
-        compilerOptions += STRINGIFY_VALUE(CL_LIBCLCXX_DIR);
-    }
-    else
-    {
-        compilerOptions += " -include opencl.h";
-    }
-
-#ifdef KHRONOS_OFFLINE_COMPILER_OPTIONS
-    compilerOptions += STRINGIFY_VALUE(KHRONOS_OFFLINE_COMPILER_OPTIONS);
-#endif
-
-    // Add build options passed to this function
-    compilerOptions += " " + bOptions;
-    compilerOptions += " " + sourceFilename + " -o " + outputFilename;
-    std::string runString =
-        STRINGIFY_VALUE(KHRONOS_OFFLINE_COMPILER) + compilerOptions;
-
-    return runString;
-}
-#endif // KHRONOS_OFFLINE_COMPILER
-
 static cl_int get_cl_device_info_str(const cl_device_id device,
                                      const cl_uint device_address_space_size,
                                      const CompilationMode compilationMode,
@@ -476,50 +425,28 @@ static int invoke_offline_compiler(const cl_device_id device,
                                    const CompilationMode compilationMode,
                                    const std::string &bOptions,
                                    const std::string &sourceFilename,
-                                   const std::string &outputFilename,
-                                   const bool openclCXX)
+                                   const std::string &outputFilename)
 {
     std::string runString;
-    if (openclCXX)
-    {
-#ifndef KHRONOS_OFFLINE_COMPILER
-        log_error("CL C++ compilation is not possible: "
-                  "KHRONOS_OFFLINE_COMPILER was not defined.\n");
-        return CL_INVALID_OPERATION;
-#else
-        if (compilationMode != kSpir_v)
-        {
-            log_error("Compilation mode must be SPIR-V for Khronos compiler");
-            return -1;
-        }
-        runString = get_khronos_compiler_command(
-            device_address_space_size, openclCXX, bOptions, sourceFilename,
-            outputFilename);
-#endif
-    }
-    else
-    {
-        std::string clDeviceInfoFilename;
-
-        // See cl_offline_compiler-interface.txt for a description of the
-        // format of the CL device information file generated below, and
-        // the internal command line interface for invoking the offline
-        // compiler.
+    std::string clDeviceInfoFilename;
 
-        cl_int err =
-            write_cl_device_info(device, device_address_space_size,
-                                 compilationMode, clDeviceInfoFilename);
-        if (err != CL_SUCCESS)
-        {
-            log_error("Failed writing CL device info file\n");
-            return err;
-        }
+    // See cl_offline_compiler-interface.txt for a description of the
+    // format of the CL device information file generated below, and
+    // the internal command line interface for invoking the offline
+    // compiler.
 
-        runString = get_offline_compilation_command(
-            device_address_space_size, compilationMode, bOptions,
-            sourceFilename, outputFilename, clDeviceInfoFilename);
+    cl_int err = write_cl_device_info(device, device_address_space_size,
+                                      compilationMode, clDeviceInfoFilename);
+    if (err != CL_SUCCESS)
+    {
+        log_error("Failed writing CL device info file\n");
+        return err;
     }
 
+    runString = get_offline_compilation_command(
+        device_address_space_size, compilationMode, bOptions, sourceFilename,
+        outputFilename, clDeviceInfoFilename);
+
     // execute script
     log_info("Executing command: %s\n", runString.c_str());
     fflush(stdout);
@@ -577,9 +504,8 @@ static cl_int get_device_address_bits(const cl_device_id device,
 
 static int get_offline_compiler_output(
     std::ifstream &ifs, const cl_device_id device, cl_uint deviceAddrSpaceSize,
-    const bool openclCXX, const CompilationMode compilationMode,
-    const std::string &bOptions, const std::string &kernelPath,
-    const std::string &kernelNamePrefix)
+    const CompilationMode compilationMode, const std::string &bOptions,
+    const std::string &kernelPath, const std::string &kernelNamePrefix)
 {
     std::string sourceFilename =
         get_cl_source_filename_with_path(kernelPath, kernelNamePrefix);
@@ -599,9 +525,9 @@ static int get_offline_compiler_output(
         }
         else
         {
-            int error = invoke_offline_compiler(
-                device, deviceAddrSpaceSize, compilationMode, bOptions,
-                sourceFilename, outputFilename, openclCXX);
+            int error = invoke_offline_compiler(device, deviceAddrSpaceSize,
+                                                compilationMode, bOptions,
+                                                sourceFilename, outputFilename);
             if (error != CL_SUCCESS) return error;
 
             // read output file
@@ -620,8 +546,7 @@ static int get_offline_compiler_output(
 static int create_single_kernel_helper_create_program_offline(
     cl_context context, cl_device_id device, cl_program *outProgram,
     unsigned int numKernelLines, const char *const *kernelProgram,
-    const char *buildOptions, const bool openclCXX,
-    CompilationMode compilationMode)
+    const char *buildOptions, CompilationMode compilationMode)
 {
     if (kCacheModeDumpCl == gCompilationCacheMode)
     {
@@ -649,22 +574,10 @@ static int create_single_kernel_helper_create_program_offline(
 
     std::ifstream ifs;
     error = get_offline_compiler_output(ifs, device, device_address_space_size,
-                                        openclCXX, compilationMode, bOptions,
+                                        compilationMode, bOptions,
                                         gCompilationCachePath, kernelName);
     if (error != CL_SUCCESS) return error;
 
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT
-// ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    if (openclCXX)
-    {
-        return CL_SUCCESS;
-    }
-#endif
-
     ifs.seekg(0, ifs.end);
     int length = ifs.tellg();
     ifs.seekg(0, ifs.beg);
@@ -748,8 +661,7 @@ static int create_single_kernel_helper_create_program_offline(
 static int create_single_kernel_helper_create_program(
     cl_context context, cl_device_id device, cl_program *outProgram,
     unsigned int numKernelLines, const char **kernelProgram,
-    const char *buildOptions, const bool openclCXX,
-    CompilationMode compilationMode)
+    const char *buildOptions, CompilationMode compilationMode)
 {
     std::lock_guard<std::mutex> compiler_lock(gCompilerMutex);
 
@@ -787,37 +699,39 @@ static int create_single_kernel_helper_create_program(
     {
         return create_single_kernel_helper_create_program_offline(
             context, device, outProgram, numKernelLines, kernelProgram,
-            buildOptions, openclCXX, compilationMode);
+            buildOptions, compilationMode);
     }
 }
 
-int create_single_kernel_helper_create_program(
-    cl_context context, cl_program *outProgram, unsigned int numKernelLines,
-    const char **kernelProgram, const char *buildOptions, const bool openclCXX)
+int create_single_kernel_helper_create_program(cl_context context,
+                                               cl_program *outProgram,
+                                               unsigned int numKernelLines,
+                                               const char **kernelProgram,
+                                               const char *buildOptions)
 {
     return create_single_kernel_helper_create_program(
         context, NULL, outProgram, numKernelLines, kernelProgram, buildOptions,
-        openclCXX, gCompilationMode);
+        gCompilationMode);
 }
 
 int create_single_kernel_helper_create_program_for_device(
     cl_context context, cl_device_id device, cl_program *outProgram,
     unsigned int numKernelLines, const char **kernelProgram,
-    const char *buildOptions, const bool openclCXX)
+    const char *buildOptions)
 {
     return create_single_kernel_helper_create_program(
         context, device, outProgram, numKernelLines, kernelProgram,
-        buildOptions, openclCXX, gCompilationMode);
+        buildOptions, gCompilationMode);
 }
 
 int create_single_kernel_helper_with_build_options(
     cl_context context, cl_program *outProgram, cl_kernel *outKernel,
     unsigned int numKernelLines, const char **kernelProgram,
-    const char *kernelName, const char *buildOptions, const bool openclCXX)
+    const char *kernelName, const char *buildOptions)
 {
     return create_single_kernel_helper(context, outProgram, outKernel,
                                        numKernelLines, kernelProgram,
-                                       kernelName, buildOptions, openclCXX);
+                                       kernelName, buildOptions);
 }
 
 // Creates and builds OpenCL C/C++ program, and creates a kernel
@@ -826,7 +740,7 @@ int create_single_kernel_helper(cl_context context, cl_program *outProgram,
                                 unsigned int numKernelLines,
                                 const char **kernelProgram,
                                 const char *kernelName,
-                                const char *buildOptions, const bool openclCXX)
+                                const char *buildOptions)
 {
     // For the logic that automatically adds -cl-std it is much cleaner if the
     // build options have RAII. This buffer will store the potentially updated
@@ -865,51 +779,14 @@ int create_single_kernel_helper(cl_context context, cl_program *outProgram,
         build_options_internal += cl_std;
         buildOptions = build_options_internal.c_str();
     }
-    int error;
-    // Create OpenCL C++ program
-    if (openclCXX)
-    {
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT
-// ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-        // Save global variable
-        bool tempgCompilationCacheMode = gCompilationCacheMode;
-        // Force OpenCL C++ -> SPIR-V compilation on every run
-        gCompilationCacheMode = kCacheModeOverwrite;
-#endif
-        error = create_openclcpp_program(context, outProgram, numKernelLines,
-                                         kernelProgram, buildOptions);
-        if (error != CL_SUCCESS)
-        {
-            log_error("Create program failed: %d, line: %d\n", error, __LINE__);
-            return error;
-        }
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT
-// ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-        // Restore global variables
-        gCompilationCacheMode = tempgCompilationCacheMode;
-        log_info("WARNING: KERNEL %s WAS ONLY COMPILED TO SPIR-V\n",
-                 kernelName);
-        return error;
-#endif
-    }
-    // Create OpenCL C program
-    else
+    int error = create_single_kernel_helper_create_program(
+        context, outProgram, numKernelLines, kernelProgram, buildOptions);
+    if (error != CL_SUCCESS)
     {
-        error = create_single_kernel_helper_create_program(
-            context, outProgram, numKernelLines, kernelProgram, buildOptions);
-        if (error != CL_SUCCESS)
-        {
-            log_error("Create program failed: %d, line: %d\n", error, __LINE__);
-            return error;
-        }
+        log_error("Create program failed: %d, line: %d\n", error, __LINE__);
+        return error;
     }
+
     // Remove offline-compiler-only build options
     std::string newBuildOptions;
     if (buildOptions != NULL)
@@ -930,18 +807,6 @@ int create_single_kernel_helper(cl_context context, cl_program *outProgram,
         kernelName, newBuildOptions.c_str());
 }
 
-// Creates OpenCL C++ program
-int create_openclcpp_program(cl_context context, cl_program *outProgram,
-                             unsigned int numKernelLines,
-                             const char **kernelProgram,
-                             const char *buildOptions)
-{
-    // Create program
-    return create_single_kernel_helper_create_program(
-        context, NULL, outProgram, numKernelLines, kernelProgram, buildOptions,
-        true, kSpir_v);
-}
-
 // Builds OpenCL C/C++ program and creates
 int build_program_create_kernel_helper(
     cl_context context, cl_program *outProgram, cl_kernel *outKernel,
diff --git a/test_common/harness/kernelHelpers.h b/test_common/harness/kernelHelpers.h
index d10d44ed2f..f971a8f2b0 100644
--- a/test_common/harness/kernelHelpers.h
+++ b/test_common/harness/kernelHelpers.h
@@ -72,24 +72,21 @@ extern int
 create_single_kernel_helper(cl_context context, cl_program *outProgram,
                             cl_kernel *outKernel, unsigned int numKernelLines,
                             const char **kernelProgram, const char *kernelName,
-                            const char *buildOptions = NULL,
-                            const bool openclCXX = false);
+                            const char *buildOptions = NULL);
 
 extern int create_single_kernel_helper_with_build_options(
     cl_context context, cl_program *outProgram, cl_kernel *outKernel,
     unsigned int numKernelLines, const char **kernelProgram,
-    const char *kernelName, const char *buildOptions,
-    const bool openclCXX = false);
+    const char *kernelName, const char *buildOptions);
 
 extern int create_single_kernel_helper_create_program(
     cl_context context, cl_program *outProgram, unsigned int numKernelLines,
-    const char **kernelProgram, const char *buildOptions = NULL,
-    const bool openclCXX = false);
+    const char **kernelProgram, const char *buildOptions = NULL);
 
 extern int create_single_kernel_helper_create_program_for_device(
     cl_context context, cl_device_id device, cl_program *outProgram,
     unsigned int numKernelLines, const char **kernelProgram,
-    const char *buildOptions = NULL, const bool openclCXX = false);
+    const char *buildOptions = NULL);
 
 /* Creates OpenCL C++ program. This one must be used for creating OpenCL C++
  * program. */
diff --git a/test_conformance/CMakeLists.txt b/test_conformance/CMakeLists.txt
index 87d68597f0..363ece8698 100644
--- a/test_conformance/CMakeLists.txt
+++ b/test_conformance/CMakeLists.txt
@@ -50,9 +50,6 @@ add_subdirectory( subgroups )
 add_subdirectory( workgroups )
 add_subdirectory( pipes )
 add_subdirectory( device_timer )
-if(KHRONOS_OFFLINE_COMPILER)
-    add_subdirectory( clcpp )
-endif()
 add_subdirectory( spirv_new )
 add_subdirectory( spir )
 
diff --git a/test_conformance/clcpp/CMakeLists.txt b/test_conformance/clcpp/CMakeLists.txt
deleted file mode 100644
index 04484e7adf..0000000000
--- a/test_conformance/clcpp/CMakeLists.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-add_subdirectory(address_spaces)
-add_subdirectory(api)
-add_subdirectory(atomics)
-add_subdirectory(attributes)
-add_subdirectory(common_funcs)
-add_subdirectory(convert)
-add_subdirectory(device_queue)
-add_subdirectory(geometric_funcs)
-add_subdirectory(images)
-add_subdirectory(integer_funcs)
-add_subdirectory(math_funcs)
-add_subdirectory(pipes)
-add_subdirectory(program_scope_ctors_dtors)
-add_subdirectory(reinterpret)
-add_subdirectory(relational_funcs)
-add_subdirectory(spec_constants)
-add_subdirectory(subgroups)
-add_subdirectory(synchronization)
-add_subdirectory(vload_vstore)
-add_subdirectory(workgroups)
-add_subdirectory(workitems)
diff --git a/test_conformance/clcpp/address_spaces/CMakeLists.txt b/test_conformance/clcpp/address_spaces/CMakeLists.txt
deleted file mode 100644
index 2b6369f65c..0000000000
--- a/test_conformance/clcpp/address_spaces/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_ADDRESS_SPACES)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/address_spaces/common.hpp b/test_conformance/clcpp/address_spaces/common.hpp
deleted file mode 100644
index fad7ba96a4..0000000000
--- a/test_conformance/clcpp/address_spaces/common.hpp
+++ /dev/null
@@ -1,202 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_ADDRESS_SPACES_COMMON_HPP
-#define TEST_CONFORMANCE_CLCPP_ADDRESS_SPACES_COMMON_HPP
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#define RUN_ADDRESS_SPACES_TEST_MACRO(TEST_CLASS) \
-    last_error = run_address_spaces_test(  \
-        device, context, queue, n_elems, TEST_CLASS \
-    );  \
-    CHECK_ERROR(last_error) \
-    error |= last_error;
-
-// This is a base class for address spaces tests.
-template <class T>
-struct address_spaces_test : public detail::base_func_type<T>
-{
-    // output buffer type
-    typedef T type;
-
-    virtual ~address_spaces_test() {};
-    // Returns test name
-    virtual std::string str() = 0;
-    // Returns OpenCL program source
-    virtual std::string generate_program() = 0;
-    // Returns kernel names IN ORDER
-    virtual std::vector<std::string> get_kernel_names()
-    {
-        // Typical case, that is, only one kernel
-        return { this->get_kernel_name() };
-    }
-
-    // Return value that is expected to be in output_buffer[i]
-    virtual T operator()(size_t i, size_t work_group_size) = 0;
-
-    // If local size has to be set in clEnqueueNDRangeKernel()
-    // this should return true; otherwise - false;
-    virtual bool set_local_size()
-    {
-        return false;
-    }
-
-    // Calculates maximal work-group size (one dim)
-    virtual size_t get_max_local_size(const std::vector<cl_kernel>& kernels,
-                                      cl_device_id device,
-                                      size_t work_group_size, // default work-group size
-                                      cl_int& error)
-    {
-        size_t wg_size = work_group_size;
-        for(auto&k : kernels)
-        {
-            size_t max_wg_size;
-            error = clGetKernelWorkGroupInfo(k, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL);
-            RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
-            wg_size = (std::min)(max_wg_size, wg_size);
-        }
-        return wg_size;
-    }
-
-    // This covers typical case: each kernel is executed once, every kernel
-    // has only one argument which is output buffer
-    virtual cl_int execute(const std::vector<cl_kernel>& kernels,
-                           cl_mem& output_buffer,
-                           cl_command_queue& queue,
-                           size_t work_size,
-                           size_t work_group_size)
-    {
-        cl_int err;
-        for(auto& k : kernels)
-        {
-            err = clSetKernelArg(k, 0, sizeof(output_buffer), &output_buffer);
-            RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-            err = clEnqueueNDRangeKernel(
-                queue, k, 1,
-                NULL, &work_size, this->set_local_size() ? &work_group_size : NULL,
-                0, NULL, NULL
-            );
-            RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-        }
-        return err;
-    }
-};
-
-template <class address_spaces_test>
-int run_address_spaces_test(cl_device_id device, cl_context context, cl_command_queue queue, size_t count, address_spaces_test op)
-{
-    cl_mem buffers[1];
-    cl_program program;
-    std::vector<cl_kernel> kernels;
-    size_t wg_size;
-    size_t work_size[1];
-    cl_int err;
-
-    typedef typename address_spaces_test::type TYPE;
-
-    // Don't run test for unsupported types
-    if(!(type_supported<TYPE>(device)))
-    {
-        return CL_SUCCESS;
-    }
-
-    std::string code_str = op.generate_program();
-    std::vector<std::string> kernel_names = op.get_kernel_names();
-    if(kernel_names.empty())
-    {
-        RETURN_ON_ERROR_MSG(-1, "No kernel to run");
-    }
-    kernels.resize(kernel_names.size());
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &(kernels[0]), code_str, kernel_names[0]);
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &(kernels[0]), code_str, kernel_names[0], "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-    for(size_t i = 1; i < kernels.size(); i++)
-    {
-        kernels[i] = clCreateKernel(program, kernel_names[i].c_str(), &err);
-        RETURN_ON_CL_ERROR(err, "clCreateKernel");
-    }
-#else
-    err = create_opencl_kernel(context, &program, &(kernels[0]), code_str, kernel_names[0]);
-    RETURN_ON_ERROR(err)
-    for(size_t i = 1; i < kernels.size(); i++)
-    {
-        kernels[i] = clCreateKernel(program, kernel_names[i].c_str(), &err);
-        RETURN_ON_CL_ERROR(err, "clCreateKernel");
-    }
-#endif
-
-    // Find the max possible wg size for among all the kernels
-    wg_size = op.get_max_local_size(kernels, device, 1024, err);
-    RETURN_ON_ERROR(err);
-
-    work_size[0] = count;
-    if(op.set_local_size())
-    {
-        size_t wg_number = static_cast<size_t>(
-            std::ceil(static_cast<double>(count) / wg_size)
-        );
-        work_size[0] = wg_number * wg_size;
-    }
-
-    // output on host
-    std::vector<TYPE> output = generate_output<TYPE>(work_size[0], 9999);
-
-    // output buffer
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(TYPE) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    // Execute test
-    err = op.execute(kernels, buffers[0], queue, work_size[0], wg_size);
-    RETURN_ON_ERROR(err)
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(TYPE) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    for(size_t i = 0; i < output.size(); i++)
-    {
-        TYPE v = op(i, wg_size);
-        if(!(are_equal(v, output[i], detail::make_value<TYPE>(0), op)))
-        {
-            RETURN_ON_ERROR_MSG(-1,
-                "test_%s(%s) failed. Expected: %s, got: %s", op.str().c_str(), type_name<TYPE>().c_str(),
-                format_value(v).c_str(), format_value(output[i]).c_str()
-            );
-        }
-    }
-    log_info("test_%s(%s) passed\n", op.str().c_str(), type_name<TYPE>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    for(auto& k : kernels)
-        clReleaseKernel(k);
-    clReleaseProgram(program);
-    return err;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_ADDRESS_SPACES_COMMON_HPP
diff --git a/test_conformance/clcpp/address_spaces/main.cpp b/test_conformance/clcpp/address_spaces/main.cpp
deleted file mode 100644
index d618e17977..0000000000
--- a/test_conformance/clcpp/address_spaces/main.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "test_pointer_types.hpp"
-#include "test_storage_types.hpp"
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/address_spaces/test_pointer_types.hpp b/test_conformance/clcpp/address_spaces/test_pointer_types.hpp
deleted file mode 100644
index af228d0d94..0000000000
--- a/test_conformance/clcpp/address_spaces/test_pointer_types.hpp
+++ /dev/null
@@ -1,412 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_ADDRESS_SPACES_TEST_POINTER_TYPES_HPP
-#define TEST_CONFORMANCE_CLCPP_ADDRESS_SPACES_TEST_POINTER_TYPES_HPP
-
-#include <type_traits>
-
-#include "common.hpp"
-
-// ----------------------------
-// ---------- PRIVATE
-// ----------------------------
-
-template <class T>
-struct private_pointer_test : public address_spaces_test<T>
-{
-    std::string str()
-    {
-        return "private_pointer";
-    }
-
-    T operator()(size_t i, size_t work_group_size)
-    {
-        typedef typename scalar_type<T>::type SCALAR;
-        (void) work_group_size;
-        return detail::make_value<T>(static_cast<SCALAR>(i));
-    }
-
-    // Each work-item writes its global id to output[work-item-global-id]
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) 
-            return 
-                "__kernel void " + this->get_kernel_name() + "(global " + type_name<T>() + " *output)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    output[gid] = (" + type_name<T>() + ")(gid);\n"
-                "}\n";
-
-        #else
-            return         
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_array>\n"
-                "using namespace cl;\n"
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<" + type_name<T>() + "[]> output)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    typedef " + type_name<T>() + " TYPE;\n"
-                "    TYPE v = TYPE(gid);\n"
-                "    private_ptr<TYPE> v_ptr1(dynamic_asptr_cast<private_ptr<TYPE>>(&v));\n"
-                "    private_ptr<TYPE> v_ptr2(v_ptr1);\n"
-                "    TYPE a[] = { TYPE(0), TYPE(1) };\n"
-                "    private_ptr<TYPE> a_ptr = dynamic_asptr_cast<private_ptr<TYPE>>(a);\n"
-                "    a_ptr++;\n"
-                "    TYPE * a_ptr2 = a_ptr.get();\n"
-                "    *a_ptr2 = *v_ptr2;\n"
-                "    output[gid] = a[1];\n"
-                "}\n";        
-        #endif
-    }
-};
-
-AUTO_TEST_CASE(test_private_pointer)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // private pointer
-    RUN_ADDRESS_SPACES_TEST_MACRO(private_pointer_test<cl_uint>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(private_pointer_test<cl_float2>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(private_pointer_test<cl_float4>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(private_pointer_test<cl_float8>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(private_pointer_test<cl_uint16>());
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-// ----------------------------
-// ---------- LOCAL
-// ----------------------------
-
-template <class T>
-struct local_pointer_test : public address_spaces_test<T>
-{
-    std::string str()
-    {
-        return "local_pointer";
-    }
-
-    T operator()(size_t i, size_t work_group_size)
-    {
-        typedef typename scalar_type<T>::type SCALAR;
-        size_t r = i / work_group_size;
-        return detail::make_value<T>(static_cast<SCALAR>(r));
-    }
-
-    bool set_local_size()
-    {
-        return true;
-    }
-
-    size_t get_max_local_size(const std::vector<cl_kernel>& kernels, 
-                              cl_device_id device,
-                              size_t work_group_size, // default work-group size
-                              cl_int& error)
-    {
-        // Set size of the local memory, we need to to this to correctly calculate
-        // max possible work-group size.
-        // Additionally this already set 2nd argument of the test kernel, so we don't
-        // have to modify execute() method.
-        error = clSetKernelArg(kernels[0], 1, sizeof(cl_uint), NULL);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg");
-
-        size_t wg_size;
-        error = clGetKernelWorkGroupInfo(
-            kernels[0], device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL
-        );
-        RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
-        wg_size = wg_size <= work_group_size ? wg_size : work_group_size;        
-        return wg_size;
-    }
-
-    // Every work-item writes id of its work-group to output[work-item-global-id]
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) 
-            return 
-                "__kernel void " + this->get_kernel_name() + "(global " + type_name<T>() + " *output, "
-                                                              "local uint * local_mem_ptr)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    output[gid] = (" + type_name<T>() + ")(get_group_id(0));\n"
-                "}\n";
-
-        #else
-            return         
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_synchronization>\n"
-                "#include <opencl_array>\n"
-                "using namespace cl;\n"
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<" + type_name<T>() + "[]> output, "
-                                                              "local_ptr<uint[]> local_mem_ptr)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    size_t lid = get_local_id(0);\n"
-                "    typedef " + type_name<T>() + " TYPE;\n"
-                // 1st work-item in work-group writes get_group_id() to var
-                "    local<uint> var;\n"
-                "    local_ptr<uint> var_ptr = var.ptr();\n"
-                "    if(lid == 0) { *var_ptr = get_group_id(0); }\n"
-                "    work_group_barrier(mem_fence::local);\n"
-                // last work-item in work-group writes var to 1st element of local_mem
-                "    local_ptr<uint[]> local_mem_ptr2(local_mem_ptr);\n"
-                "    auto local_mem_ptr3 = local_mem_ptr2.release();\n"
-                "    if(lid == (get_local_size(0) - 1)) { *(local_mem_ptr3) = var; }\n"
-                "    work_group_barrier(mem_fence::local);\n"
-                // each work-item in work-group writes local_mem_ptr[0] to output[work-item-global-id]
-                "    output[gid] = local_mem_ptr[0];\n"
-                "}\n";        
-        #endif
-    }
-};
-
-AUTO_TEST_CASE(test_local_pointer)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // local pointer
-    RUN_ADDRESS_SPACES_TEST_MACRO(local_pointer_test<cl_uint>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(local_pointer_test<cl_float2>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(local_pointer_test<cl_float4>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(local_pointer_test<cl_float8>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(local_pointer_test<cl_uint16>());
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-// ----------------------------
-// ---------- GLOBAL
-// ----------------------------
-
-template <class T>
-struct global_pointer_test : public address_spaces_test<T>
-{
-    std::string str()
-    {
-        return "global_pointer";
-    }
-
-    T operator()(size_t i, size_t work_group_size)
-    {
-        typedef typename scalar_type<T>::type SCALAR;
-        (void) work_group_size;
-        return detail::make_value<T>(static_cast<SCALAR>(i));
-    }
-
-    // Each work-item writes its global id to output[work-item-global-id]
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) 
-            return 
-                "__kernel void " + this->get_kernel_name() + "(global " + type_name<T>() + " *output)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    output[gid] = (" + type_name<T>() + ")(gid);\n"
-                "}\n";
-
-        #else
-            return         
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_array>\n"
-                "using namespace cl;\n"
-                "typedef " + type_name<T>() + " TYPE;\n"
-                "void set_to_gid(global_ptr<TYPE> ptr)\n"
-                "{\n"
-                "    *ptr = TYPE(get_global_id(0));"
-                "}\n"
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<TYPE[]> output)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    auto ptr = output.get();\n"
-                "    global_ptr<TYPE> ptr2(ptr);\n"
-                "    ptr2 += ptrdiff_t(gid);\n"
-                "    set_to_gid(ptr2);\n"
-                "}\n";        
-        #endif
-    }
-};
-
-AUTO_TEST_CASE(test_global_pointer)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // global pointer
-    RUN_ADDRESS_SPACES_TEST_MACRO(global_pointer_test<cl_uint>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(global_pointer_test<cl_float2>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(global_pointer_test<cl_float4>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(global_pointer_test<cl_float8>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(global_pointer_test<cl_uint16>());
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-// ----------------------------
-// ---------- CONSTANT
-// ----------------------------
-
-template <class T>
-struct constant_pointer_test : public address_spaces_test<T>
-{
-    // m_test_value is just a random value we use in this test.
-    constant_pointer_test() : m_test_value(0xdeaddeadU)
-    {
-
-    }
-
-    std::string str()
-    {
-        return "constant_pointer";
-    }
-
-    T operator()(size_t i, size_t work_group_size)
-    {
-        typedef typename scalar_type<T>::type SCALAR;
-        (void) work_group_size;
-        return detail::make_value<T>(static_cast<SCALAR>(m_test_value));
-    }
-
-    // Each work-item writes m_test_value to output[work-item-global-id]
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) 
-            return 
-                "__kernel void " + this->get_kernel_name() + "(global " + type_name<T>() + " *output, "
-                                                              "constant uint * const_ptr)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    output[gid] = (" + type_name<T>() + ")(const_ptr[0]);\n"
-                "}\n";
-
-        #else
-            return         
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_array>\n"
-                "using namespace cl;\n"
-                "typedef " + type_name<T>() + " TYPE;\n"
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<TYPE[]> output, "
-                                                              "constant_ptr<uint[]> const_ptr)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    constant_ptr<uint[]> const_ptr2 = const_ptr;\n"
-                "    auto const_ptr3 = const_ptr2.get();\n"
-                "    output[gid] = *const_ptr3;\n"
-                "}\n";        
-        #endif
-    }
-
-    // execute() method needs to be modified, to create additional buffer
-    // and set it in 2nd arg (constant_ptr<uint[]> const_ptr)
-    cl_int execute(const std::vector<cl_kernel>& kernels,
-                   cl_mem& output_buffer,
-                   cl_command_queue& queue,
-                   size_t work_size,
-                   size_t work_group_size)
-    {           
-        cl_int err;
-
-        // Get context from queue
-        cl_context context;
-        err = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, NULL);
-        RETURN_ON_CL_ERROR(err, "clGetCommandQueueInfo");
-
-        // Create constant buffer
-        auto const_buff = clCreateBuffer(context, CL_MEM_READ_ONLY,
-                                         sizeof(cl_uint), NULL, &err);
-        RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-        // Write m_test_value to const_buff
-        err = clEnqueueWriteBuffer(
-            queue, const_buff, CL_TRUE, 0, sizeof(cl_uint),
-            static_cast<void *>(&m_test_value), 0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-        err = clSetKernelArg(kernels[0], 0, sizeof(output_buffer), &output_buffer);
-        err |= clSetKernelArg(kernels[0], 1, sizeof(const_buff), &const_buff);
-        RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-        err = clEnqueueNDRangeKernel(
-            queue, kernels[0], 1, NULL, &work_size, this->set_local_size() ? &work_group_size : NULL, 0, NULL, NULL
-        );      
-        RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-        err = clFinish(queue);
-        RETURN_ON_CL_ERROR(err, "clFinish");
-
-        err = clReleaseMemObject(const_buff);
-        RETURN_ON_CL_ERROR(err, "clReleaseMemObject");
-        return err;
-    }
-
-private:
-    cl_uint m_test_value;
-};
-
-AUTO_TEST_CASE(test_constant_pointer)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // constant pointer
-    RUN_ADDRESS_SPACES_TEST_MACRO(constant_pointer_test<cl_uint>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(constant_pointer_test<cl_float2>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(constant_pointer_test<cl_float4>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(constant_pointer_test<cl_float8>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(constant_pointer_test<cl_uint16>());
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_ADDRESS_SPACES_TEST_POINTER_TYPES_HPP
diff --git a/test_conformance/clcpp/address_spaces/test_storage_types.hpp b/test_conformance/clcpp/address_spaces/test_storage_types.hpp
deleted file mode 100644
index e47f9523e7..0000000000
--- a/test_conformance/clcpp/address_spaces/test_storage_types.hpp
+++ /dev/null
@@ -1,418 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_ADDRESS_SPACES_TEST_STORAGE_TYPES_HPP
-#define TEST_CONFORMANCE_CLCPP_ADDRESS_SPACES_TEST_STORAGE_TYPES_HPP
-
-#include <type_traits>
-
-#include "common.hpp"
-
-// ----------------------------
-// ---------- PRIVATE
-// ----------------------------
-
-template <class T>
-struct private_storage_test : public address_spaces_test<T>
-{
-    std::string str()
-    {
-        return "private_storage";
-    }
-
-    T operator()(size_t i, size_t work_group_size)
-    {
-        typedef typename scalar_type<T>::type SCALAR;
-        (void) work_group_size;
-        return detail::make_value<T>(static_cast<SCALAR>(i));
-    }
-
-    // Each work-item writes its global id to output[work-item-global-id]
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) 
-            return 
-                "__kernel void " + this->get_kernel_name() + "(global " + type_name<T>() + " *output)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    output[gid] = (" + type_name<T>() + ")(gid);\n"
-                "}\n";
-
-        #else
-            return         
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_array>\n"
-                "using namespace cl;\n"
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<" + type_name<T>() + "[]> output)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    typedef " + type_name<T>() + " TYPE;\n"
-                "    priv<TYPE> v = { TYPE(gid) };\n"
-                "    const TYPE *v_ptr1 = &v;\n"
-                "    private_ptr<TYPE> v_ptr2 = v.ptr();\n"
-                "    TYPE v2 = *v_ptr2;\n"
-                "    priv<array<TYPE, 1>> a;\n"
-                "    *(a.begin()) = v2;\n"
-                "    output[gid] = a[0];\n"
-                "}\n";        
-        #endif
-    }
-};
-
-AUTO_TEST_CASE(test_private_storage)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // private storage
-    RUN_ADDRESS_SPACES_TEST_MACRO(private_storage_test<cl_uint>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(private_storage_test<cl_float2>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(private_storage_test<cl_float4>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(private_storage_test<cl_float8>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(private_storage_test<cl_uint16>());
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-// ----------------------------
-// ---------- LOCAL
-// ----------------------------
-
-template <class T>
-struct local_storage_test : public address_spaces_test<T>
-{
-    std::string str()
-    {
-        return "local_storage";
-    }
-
-    T operator()(size_t i, size_t work_group_size)
-    {
-        typedef typename scalar_type<T>::type SCALAR;
-        size_t r = i / work_group_size;
-        return detail::make_value<T>(static_cast<SCALAR>(r));
-    }
-
-    bool set_local_size()
-    {
-        return true;
-    }
-
-    // Every work-item writes id of its work-group to output[work-item-global-id]
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) 
-            return 
-                "__kernel void " + this->get_kernel_name() + "(global " + type_name<T>() + " *output)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    output[gid] = (" + type_name<T>() + ")(get_group_id(0));\n"
-                "}\n";
-
-        #else
-            return         
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_synchronization>\n"
-                "#include <opencl_array>\n"
-                "using namespace cl;\n"
-                // Using program scope local variable
-                "local<" + type_name<T>() + "> program_scope_var;"
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<" + type_name<T>() + "[]> output)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    size_t lid = get_local_id(0);\n"
-                "    typedef " + type_name<T>() + " TYPE;\n"
-                // 1st work-item in work-group writes get_group_id() to var
-                "    local<TYPE> var;\n"
-                "    if(lid == 0) { var = TYPE(get_group_id(0)); }\n"
-                "    work_group_barrier(mem_fence::local);\n"
-                // last work-item in work-group writes var to 1st element of a
-                "    local_ptr<TYPE> var_ptr = var.ptr();\n"
-                "    TYPE var2 = *var_ptr;\n"
-                "    local<array<TYPE, 1>> a;\n"
-                "    if(lid == (get_local_size(0) - 1)) { *(a.begin()) = var2; }\n"
-                "    work_group_barrier(mem_fence::local);\n"
-                // 1st work-item in work-group writes a[0] to program_scope_var
-                "    if(lid == 0) { program_scope_var = a[0]; }\n"
-                "    work_group_barrier(mem_fence::local);\n"
-                "    const TYPE *program_scope_var_ptr = &program_scope_var;\n"
-                "    output[gid] = *program_scope_var_ptr;\n"
-                "}\n";        
-        #endif
-    }
-};
-
-AUTO_TEST_CASE(test_local_storage)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // local storage
-    RUN_ADDRESS_SPACES_TEST_MACRO(local_storage_test<cl_uint>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(local_storage_test<cl_float2>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(local_storage_test<cl_float4>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(local_storage_test<cl_float8>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(local_storage_test<cl_int16>());
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-// ----------------------------
-// ---------- GLOBAL
-// ----------------------------
-
-template <class T>
-struct global_storage_test : public address_spaces_test<T>
-{
-    // m_test_value is just a random value we use in this test.
-    // m_test_value should not be zero.
-    global_storage_test() : m_test_value(0xdeaddeadU)
-    {
-
-    }
-
-    std::string str()
-    {
-        return "global_storage";
-    }
-
-    T operator()(size_t i, size_t work_group_size)
-    {
-        typedef typename scalar_type<T>::type SCALAR;
-        return detail::make_value<T>(static_cast<SCALAR>(m_test_value));
-    }
-
-    std::vector<std::string> get_kernel_names()
-    {
-        return 
-        {
-            this->get_kernel_name() + "1",
-            this->get_kernel_name() + "2"
-        };
-    }
-
-    // Every work-item writes m_test_value to output[work-item-global-id]
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) 
-            return 
-                "__kernel void " + this->get_kernel_names()[0] + "(global " + type_name<T>() + " *output, "
-                                                                  "uint test_value)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    output[gid] = (" + type_name<T>() + ")(test_value);\n"
-                "}\n"
-                "__kernel void " + this->get_kernel_names()[1] + "(global " + type_name<T>() + " *output)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    output[gid] = output[gid];\n"
-                "}\n";
-        #else
-            return         
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_array>\n"
-                "using namespace cl;\n"
-                "typedef " + type_name<T>() + " TYPE;\n"
-                // Using program scope global variable
-                "global<array<TYPE, 1>> program_scope_global_array;"
-                "__kernel void " + this->get_kernel_names()[0] + "(global_ptr<" + type_name<T>() + "[]> output, "
-                                                                  "uint test_value)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                // 1st work-item writes test_value to program_scope_global_array[0]
-                "    if(gid == 0) { program_scope_global_array[0] = test_value; }\n"
-                "}\n" 
-                "__kernel void " + this->get_kernel_names()[1] + "(global_ptr<" + type_name<T>() + "[]> output)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    static global<uint> func_scope_global_var { 0 };\n"
-                // if (func_scope_global_var == 1) is true then
-                // each work-item saves program_scope_global_array[0] to output[work-item-global-id]
-                "    if(func_scope_global_var == uint(1))\n"
-                "    {\n"
-                "        output[gid] = program_scope_global_array[0];\n"
-                "        return;\n"
-                "    }\n"
-                // 1st work-item writes 1 to func_scope_global_var
-                "    if(gid == 0) { func_scope_global_var = uint(1); }\n"
-                "}\n";         
-        #endif
-    }
-
-    // In this test execution is quite complicated. We have two kernels.
-    // 1st kernel tests program scope global variable, and 2nd kernel tests 
-    // function scope global variable (that's why it is run twice).
-    cl_int execute(const std::vector<cl_kernel>& kernels,
-                   cl_mem& output_buffer,
-                   cl_command_queue& queue,
-                   size_t work_size,
-                   size_t wg_size)
-    {           
-        cl_int err;
-        err = clSetKernelArg(kernels[0], 0, sizeof(output_buffer), &output_buffer);
-        err |= clSetKernelArg(kernels[0], 1, sizeof(cl_uint), &m_test_value);
-        RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-        // Run first kernel, once.
-        // This kernel saves m_test_value to program scope global variable called program_scope_global_var
-        err = clEnqueueNDRangeKernel(
-            queue, kernels[0], 1, NULL, &work_size, this->set_local_size() ? &wg_size : NULL, 0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-        err = clFinish(queue);
-        RETURN_ON_CL_ERROR(err, "clFinish")
-
-        err = clSetKernelArg(kernels[1], 0, sizeof(output_buffer), &output_buffer);
-        // Run 2nd kernel, twice.
-        // 1st run: program_scope_global_var is saved to function scope global array called func_scope_global_array
-        // 2nd run: each work-item saves func_scope_global_array[0] to ouput[work-item-global-id]
-        for(size_t i = 0; i < 2; i++)
-        {
-            err = clEnqueueNDRangeKernel(
-                queue, kernels[1], 1, NULL, &work_size, this->set_local_size() ? &wg_size : NULL, 0, NULL, NULL
-            );
-            RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-            err = clFinish(queue);
-            RETURN_ON_CL_ERROR(err, "clFinish")
-        }
-        return err;
-    }
-
-private:
-    cl_uint m_test_value;
-};
-
-AUTO_TEST_CASE(test_global_storage)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    RUN_ADDRESS_SPACES_TEST_MACRO(global_storage_test<cl_uint>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(global_storage_test<cl_float2>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(global_storage_test<cl_float4>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(global_storage_test<cl_float8>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(global_storage_test<cl_int16>());
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-// ----------------------------
-// ---------- CONSTANT
-// ----------------------------
-
-template <class T>
-struct constant_storage_test : public address_spaces_test<T>
-{
-    // m_test_value is just a random value we use in this test.
-    constant_storage_test() : m_test_value(0xdeaddeadU)
-    {
-
-    }
-
-    std::string str()
-    {
-        return "constant_storage";
-    }
-
-    T operator()(size_t i, size_t work_group_size)
-    {
-        typedef typename scalar_type<T>::type SCALAR;
-        return detail::make_value<T>(static_cast<SCALAR>(m_test_value));
-    }
-
-    // Every work-item writes m_test_value to output[work-item-global-id]
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) 
-            return 
-                "__kernel void " + this->get_kernel_name() + "(global " + type_name<T>() + " *output)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    output[gid] = (" + type_name<T>() + ")(" + std::to_string(m_test_value) + ");\n"
-                "}\n";
-
-        #else
-            return         
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_array>\n"
-                "using namespace cl;\n"
-                // Program scope constant variable, program_scope_var == (m_test_value - 1)
-                "constant<uint> program_scope_const{ (" + std::to_string(m_test_value) + " - 1) };"
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<" + type_name<T>() + "[]> output)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    typedef " + type_name<T>() + " TYPE;\n"
-                "    static constant<uint> func_scope_const{ 1 };\n"
-                "    constant_ptr<uint> ps_const_ptr = program_scope_const.ptr();\n"
-                // "    constant_ptr<array<uint, 1>> fs_const_ptr = &func_scope_const;\n"
-                "    output[gid] = TYPE(*ps_const_ptr + func_scope_const);\n"
-                "}\n";        
-        #endif
-    }
-private:
-    cl_uint m_test_value;
-};
-
-AUTO_TEST_CASE(test_constant_storage)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    RUN_ADDRESS_SPACES_TEST_MACRO(constant_storage_test<cl_uint>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(constant_storage_test<cl_float2>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(constant_storage_test<cl_float4>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(constant_storage_test<cl_float8>());
-    RUN_ADDRESS_SPACES_TEST_MACRO(constant_storage_test<cl_int16>());
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_ADDRESS_SPACES_TEST_STORAGE_TYPES_HPP
diff --git a/test_conformance/clcpp/api/CMakeLists.txt b/test_conformance/clcpp/api/CMakeLists.txt
deleted file mode 100644
index 30763d6f62..0000000000
--- a/test_conformance/clcpp/api/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_API)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/api/main.cpp b/test_conformance/clcpp/api/main.cpp
deleted file mode 100644
index 7652838420..0000000000
--- a/test_conformance/clcpp/api/main.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "test_spec_consts.hpp"
-#include "test_ctors_dtors.hpp"
-#include "test_ctors.hpp"
-#include "test_dtors.hpp"
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/api/test_ctors.hpp b/test_conformance/clcpp/api/test_ctors.hpp
deleted file mode 100644
index ae0695ca69..0000000000
--- a/test_conformance/clcpp/api/test_ctors.hpp
+++ /dev/null
@@ -1,487 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_API_TEST_CTORS_HPP
-#define TEST_CONFORMANCE_CLCPP_API_TEST_CTORS_HPP
-
-#include <vector>
-#include <limits>
-#include <algorithm>
-#include <numeric>
-
-#include "../common.hpp"
-
-// TEST 1
-// Verify that constructors are executed before any kernel is executed.
-// Verify that when present, multiple constructors are executed. The order between
-// constructors is undefined, but they should all execute.
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-const char * kernel_test_ctors_executed =
-    "__kernel void test_ctors_executed(global uint *output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   output[gid] = 0;\n"
-    "}\n"
-;
-const char * kernel_test_ctors_executed_multiple_ctors =
-    "__kernel void test_ctors_executed_multiple_ctors(global uint *output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   output[gid] = 0;\n"
-    "}\n"
-;
-#else
-const char * kernel_test_ctors_executed =
-    "#include <opencl_memory>\n"
-    "#include <opencl_work_item>\n"
-    "using namespace cl;\n"
-    "struct ctor_test_class {\n"
-    // non-trivial ctor
-    "   ctor_test_class(int y) { x = y;};\n"
-    "   int x;\n"
-    "};\n"
-    // global scope program variable
-    "ctor_test_class global_var(int(0xbeefbeef));\n"
-    "__kernel void test_ctors_executed(global_ptr<uint[]> output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   int result = 0;\n"
-    "   if(global_var.x != int(0xbeefbeef)) result = 1;\n"
-    "   output[gid] = result;\n"
-    "}\n"
-;
-const char * kernel_test_ctors_executed_multiple_ctors =
-    "#include <opencl_memory>\n"
-    "#include <opencl_work_item>\n"
-    "#include <opencl_limits>\n"
-    "using namespace cl;\n"
-    "template<class T>\n"
-    "struct ctor_test_class {\n"
-    // non-trivial ctor
-    "   ctor_test_class(T y) { x = y;};\n"
-    "   T x;\n"
-    "};\n"
-    // global scope program variables
-    "ctor_test_class<int> global_var0(int(0xbeefbeef));\n"
-    "ctor_test_class<uint> global_var1(uint(0xbeefbeefU));\n"
-    "ctor_test_class<float> global_var2(float(FLT_MAX));\n"
-    "__kernel void test_ctors_executed_multiple_ctors(global_ptr<uint[]> output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   int result = 0;\n"
-    "   if(global_var0.x != int(0xbeefbeef))   result = 1;\n"
-    "   if(global_var1.x != uint(0xbeefbeefU)) result = 1;\n"
-    "   if(global_var2.x != float(FLT_MAX))    result = 1;\n"
-    "   output[gid] = result;\n"
-    "}\n"
-;
-#endif
-
-int test_ctors_execution(cl_device_id device,
-                         cl_context context,
-                         cl_command_queue queue,
-                         int count,
-                         std::string kernel_name,
-                         const char * kernel_source)
-{
-    int error = CL_SUCCESS;
-
-    cl_mem output_buffer;
-    cl_program program;
-    cl_kernel kernel;
-
-    size_t dim = 1;
-    size_t work_size[1];
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(context, &program, &kernel, kernel_source, kernel_name);
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(context, &program, &kernel, kernel_source, kernel_name, "", false);
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    error = create_opencl_kernel(context, &program, &kernel, kernel_source, kernel_name);
-    RETURN_ON_ERROR(error)
-#endif
-
-    // host vector, size == count, output[0...count-1] == 1
-    std::vector<cl_uint> output(count, cl_uint(1));
-    output_buffer =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_uint) * output.size(), NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    error = clEnqueueWriteBuffer(queue, output_buffer, CL_TRUE, 0, sizeof(cl_uint) * output.size(), static_cast<void *>(output.data()), 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueWriteBuffer")
-
-    error = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    work_size[0] = output.size();
-    error = clEnqueueNDRangeKernel(queue, kernel, dim, NULL, work_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-    error = clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, sizeof(cl_uint) * output.size(), static_cast<void *>(output.data()), 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-    size_t sum = std::accumulate(output.begin(), output.end(), size_t(0));
-    if(sum != 0)
-    {
-        error = -1;
-        CHECK_ERROR_MSG(error, "Test %s failed.", kernel_name.c_str());
-    }
-
-    clReleaseMemObject(output_buffer);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-AUTO_TEST_CASE(test_global_scope_ctors_executed)
-(cl_device_id device, cl_context context, cl_command_queue queue, int count)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = test_ctors_execution(
-        device, context, queue, count,
-        "test_ctors_executed", kernel_test_ctors_executed
-    );
-    CHECK_ERROR(local_error);
-    error |= local_error;
-
-    local_error = test_ctors_execution(
-        device, context, queue, count,
-        "test_ctors_executed_multiple_ctors", kernel_test_ctors_executed_multiple_ctors
-    );
-    CHECK_ERROR(local_error);
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-// TEST 2
-// Verify that constructors are only executed once when multiple kernels from a program are executed.
-
-// How: The first kernel (test_ctors_executed_once_set) is run once. It changes values of program scope
-// variables, then the second kernel is run multiple times, each time verifying that global variables
-// have correct values (the second kernel should observe the values assigned by the first kernel, not
-// by the constructors).
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-const char * program_test_ctors_executed_once =
-    "__kernel void test_ctors_executed_once_set()\n"
-    "{\n"
-    "}\n"
-    "__kernel void test_ctors_executed_once_read(global uint *output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   output[gid] = 0;\n"
-    "}\n"
-;
-#else
-const char * program_test_ctors_executed_once =
-    "#include <opencl_memory>\n"
-    "#include <opencl_work_item>\n"
-    "using namespace cl;\n"
-    // struct template
-    "template<class T>\n"
-    "struct ctor_test_class {\n"
-    // non-trivial ctor
-    "   ctor_test_class(T y) { x = y;};\n"
-    "   T x;\n"
-    "};\n"
-    // global scope program variables
-    "ctor_test_class<int> global_var0(int(0));\n"
-    "ctor_test_class<uint> global_var1(uint(0));\n"
-
-    "__kernel void test_ctors_executed_once_set()\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   if(gid == 0) {\n"
-    "       global_var0.x = int(0xbeefbeef);\n"
-    "       global_var1.x = uint(0xbeefbeefU);\n"
-    "   }\n"
-    "}\n\n"
-
-    "__kernel void test_ctors_executed_once_read(global_ptr<uint[]> output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   int result = 0;\n"
-    "   if(global_var0.x != int(0xbeefbeef))   result = 1;\n"
-    "   if(global_var1.x != uint(0xbeefbeefU)) result = 1;\n"
-    "   output[gid] = result;\n"
-    "}\n"
-;
-#endif
-
-AUTO_TEST_CASE(test_global_scope_ctors_executed_once)
-(cl_device_id device, cl_context context, cl_command_queue queue, int count)
-{
-    int error = CL_SUCCESS;
-
-    cl_mem output_buffer;
-    cl_program program;
-    cl_kernel kernel_set_global_vars;
-    cl_kernel kernel_read_global_vars;
-
-    size_t dim = 1;
-    size_t work_size[1];
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(
-        context, &program, &kernel_set_global_vars,
-        program_test_ctors_executed_once, "test_ctors_executed_once_set"
-    );
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(
-        context, &program, &kernel_set_global_vars,
-        program_test_ctors_executed_once, "test_ctors_executed_once_set", "", false
-    );
-    RETURN_ON_ERROR(error)
-    // Get the second kernel
-    kernel_read_global_vars = clCreateKernel(program, "test_ctors_executed_once_read", &error);
-    RETURN_ON_CL_ERROR(error, "clCreateKernel");
-// Normal run
-#else
-    error = create_opencl_kernel(
-        context, &program, &kernel_set_global_vars,
-        program_test_ctors_executed_once, "test_ctors_executed_once_set"
-    );
-    RETURN_ON_ERROR(error)
-    // Get the second kernel
-    kernel_read_global_vars = clCreateKernel(program, "test_ctors_executed_once_read", &error);
-    RETURN_ON_CL_ERROR(error, "clCreateKernel");
-#endif
-
-    // Execute kernel_set_global_vars
-
-    work_size[0] = count;
-    error = clEnqueueNDRangeKernel(queue, kernel_set_global_vars, dim, NULL, work_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-    // Execute kernel_read_global_vars 4 times, each time we check if
-    // global variables have correct values.
-
-    // host vector, size == count, output[0...count-1] == 1
-    std::vector<cl_uint> output(count, cl_uint(1));
-    output_buffer =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_uint) * output.size(), NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    for(size_t i = 0; i < 4; i++)
-    {
-        std::fill(output.begin(), output.end(), cl_uint(1));
-        error = clEnqueueWriteBuffer(
-            queue, output_buffer, CL_TRUE,
-            0, sizeof(cl_uint) * output.size(),
-            static_cast<void *>(output.data()),
-            0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(error, "clEnqueueWriteBuffer")
-
-        error = clSetKernelArg(kernel_read_global_vars, 0, sizeof(output_buffer), &output_buffer);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-        work_size[0] = output.size();
-        error = clEnqueueNDRangeKernel(
-            queue, kernel_read_global_vars,
-            dim, NULL, work_size, NULL,
-            0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-        error = clEnqueueReadBuffer(
-            queue, output_buffer, CL_TRUE,
-            0, sizeof(cl_uint) * output.size(),
-            static_cast<void *>(output.data()),
-            0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-        size_t sum = std::accumulate(output.begin(), output.end(), size_t(0));
-        if(sum != 0)
-        {
-            error = -1;
-            CHECK_ERROR_MSG(error, "Test test_ctors_executed_onces failed.");
-        }
-    }
-
-    clReleaseMemObject(output_buffer);
-    clReleaseKernel(kernel_set_global_vars);
-    clReleaseKernel(kernel_read_global_vars);
-    clReleaseProgram(program);
-    return error;
-}
-
-// TEST3
-// Verify that when constructor is executed, the ND-range used is (1,1,1).
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-const char * program_test_ctors_ndrange =
-    "__kernel void test_ctors_ndrange(global int *output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   output[gid] = 0;\n"
-    "}\n"
-;
-#else
-const char * program_test_ctors_ndrange =
-    "#include <opencl_memory>\n"
-    "#include <opencl_work_item>\n"
-    "using namespace cl;\n"
-    // struct
-    "struct ctor_test_class {\n"
-    // non-trivial ctor
-    "   ctor_test_class() {\n"
-    "       x = get_global_size(0);\n"
-    "       y = get_global_size(1);\n"
-    "       z = get_global_size(2);\n"
-    "   };\n"
-    "   ulong x;\n"
-    "   ulong y;\n"
-    "   ulong z;\n"
-    // return true if the ND-range used when ctor was exectured was
-    // (1, 1, 1); otherwise - false
-    "   bool check() { return (x == 1) && (y == 1) && (z == 1);}"
-    "};\n"
-    // global scope program variables
-    "ctor_test_class global_var0;\n"
-    "ctor_test_class global_var1;\n"
-
-    "__kernel void test_ctors_ndrange(global_ptr<uint[]> output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   int result = 0;\n"
-    "   if(!global_var0.check()) result = 1;\n"
-    "   if(!global_var1.check()) result = 1;\n"
-    "   output[gid] = result;\n"
-    "}\n"
-;
-#endif
-
-AUTO_TEST_CASE(test_global_scope_ctors_ndrange)
-(cl_device_id device, cl_context context, cl_command_queue queue, int count)
-{
-    int error = CL_SUCCESS;
-
-    cl_mem output_buffer;
-    cl_program program;
-    cl_kernel kernel;
-
-    size_t dim = 1;
-    size_t work_size[1];
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        program_test_ctors_ndrange, "test_ctors_ndrange"
-    );
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        program_test_ctors_ndrange, "test_ctors_ndrange", "", false
-    );
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        program_test_ctors_ndrange, "test_ctors_ndrange"
-    );
-    RETURN_ON_ERROR(error)
-#endif
-
-    // host vector, size == count, output[0...count-1] == 1
-    std::vector<cl_uint> output(count, cl_uint(1));
-    output_buffer =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_uint) * output.size(), NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    error = clEnqueueWriteBuffer(
-        queue, output_buffer, CL_TRUE,
-        0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()),
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueWriteBuffer")
-
-    error = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    work_size[0] = output.size();
-    error = clEnqueueNDRangeKernel(
-        queue, kernel,
-        dim, NULL, work_size, NULL,
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-    error = clEnqueueReadBuffer(
-        queue, output_buffer, CL_TRUE,
-        0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()),
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-    size_t sum = std::accumulate(output.begin(), output.end(), size_t(0));
-    if(sum != 0)
-    {
-        error = -1;
-        CHECK_ERROR_MSG(error, "Test test_ctors_executed_ndrange failed.");
-    }
-
-    clReleaseMemObject(output_buffer);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_API_TEST_CTORS_HPP
diff --git a/test_conformance/clcpp/api/test_ctors_dtors.hpp b/test_conformance/clcpp/api/test_ctors_dtors.hpp
deleted file mode 100644
index 02838fa771..0000000000
--- a/test_conformance/clcpp/api/test_ctors_dtors.hpp
+++ /dev/null
@@ -1,185 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_API_TEST_CTORS_DTORS_HPP
-#define TEST_CONFORMANCE_CLCPP_API_TEST_CTORS_DTORS_HPP
-
-#include <vector>
-#include <limits>
-#include <algorithm>
-
-#include "../common.hpp"
-
-// Verify queries clGetProgramInfo correctly return the presence of constructors and/or destructors
-// in the program (using option CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT/CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT)
-// (both are present, either one is present, none is present).
-
-std::string generate_ctor_dtor_program(const bool ctor, const bool dtor)
-{
-    std::string program;
-    if(ctor)
-    {
-        program +=
-            "struct ctor_test_class {\n"
-            // non-trivial ctor
-            "   ctor_test_class(int y) { x = y;};\n"
-            "   int x;\n"
-            "};\n"
-            "ctor_test_class ctor = ctor_test_class(1024);\n"
-        ;
-    }
-    if(dtor)
-    {
-        program +=
-            "struct dtor_test_class {\n"
-            // non-trivial dtor
-            "   ~dtor_test_class() { x = -1024; };\n"
-            "   int x;\n"
-            "};\n"
-            "dtor_test_class dtor;\n"
-        ;
-    }
-    program += "__kernel void test_ctor_dtor()\n {\n }\n";
-    return program;
-}
-
-int test_get_program_info_global_ctors_dtors_present(cl_device_id device,
-                                                     cl_context context,
-                                                     cl_command_queue queue,
-                                                     const bool ctor,
-                                                     const bool dtor)
-{
-    int error = CL_SUCCESS;
-    cl_program program;
-
-    // program source and options
-    std::string options = "";
-    std::string source = generate_ctor_dtor_program(ctor, dtor);
-    const char * source_ptr = source.c_str();
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    // Create program
-    error = create_openclcpp_program(context, &program, 1, &source_ptr, options.c_str());
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    return CL_SUCCESS;
-// Normal run
-#else
-    // Create program
-    error = create_openclcpp_program(context, &program, 1, &source_ptr, options.c_str());
-    RETURN_ON_ERROR(error)
-#endif
-
-    // CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT cl_bool
-    // This indicates that the program object contains non-trivial constructor(s) that will be
-    // executed by runtime before any kernel from the program is executed.
-
-    // CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT cl_bool
-    // This indicates that the program object contains non-trivial destructor(s) that will be
-    // executed by runtime when program is destroyed.
-
-    // CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT
-    cl_bool ctors_present;
-    size_t cl_bool_size;
-    error = clGetProgramInfo(
-        program,
-        CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT,
-        sizeof(cl_bool),
-        static_cast<void*>(&ctors_present),
-        &cl_bool_size
-    );
-    RETURN_ON_CL_ERROR(error, "clGetProgramInfo")
-    if(cl_bool_size != sizeof(cl_bool))
-    {
-        error = -1;
-        CHECK_ERROR_MSG(-1, "Test failed, param_value_size_ret != sizeof(cl_bool) (%lu != %lu).", cl_bool_size, sizeof(cl_bool));
-    }
-    if(ctor && ctors_present != CL_TRUE)
-    {
-        error = -1;
-        CHECK_ERROR_MSG(-1, "Test failed, CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT: 0, should be: 1.");
-    }
-    else if(!ctor && ctors_present == CL_TRUE)
-    {
-        error = -1;
-        CHECK_ERROR_MSG(-1, "Test failed, CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT: 1, should be: 0.");
-    }
-
-    // CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT
-    cl_bool dtors_present = 0;
-    error = clGetProgramInfo(
-        program,
-        CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT,
-        sizeof(cl_bool),
-        static_cast<void*>(&ctors_present),
-        &cl_bool_size
-    );
-    RETURN_ON_CL_ERROR(error, "clGetProgramInfo")
-    if(cl_bool_size != sizeof(cl_bool))
-    {
-        error = -1;
-        CHECK_ERROR_MSG(-1, "Test failed, param_value_size_ret != sizeof(cl_bool) (%lu != %lu).", cl_bool_size, sizeof(cl_bool));
-    }
-    if(dtor && dtors_present != CL_TRUE)
-    {
-        error = -1;
-        CHECK_ERROR_MSG(-1, "Test failed, CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT: 0, should be: 1.");
-    }
-    else if(!dtor && dtors_present == CL_TRUE)
-    {
-        error = -1;
-        CHECK_ERROR_MSG(-1, "Test failed, CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT: 1, should be: 0.");
-    }
-
-    clReleaseProgram(program);
-    return error;
-}
-
-AUTO_TEST_CASE(test_global_scope_ctors_dtors_present)
-(cl_device_id device, cl_context context, cl_command_queue queue, int count)
-{
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-    // both present
-    last_error = test_get_program_info_global_ctors_dtors_present(device, context, queue, true, true);
-    CHECK_ERROR(last_error);
-    error |= last_error;
-    // dtor
-    last_error = test_get_program_info_global_ctors_dtors_present(device, context, queue, false, true);
-    CHECK_ERROR(last_error);
-    error |= last_error;
-    // ctor
-    last_error = test_get_program_info_global_ctors_dtors_present(device, context, queue, true, false);
-    CHECK_ERROR(last_error);
-    error |= last_error;
-    // none present
-    last_error = test_get_program_info_global_ctors_dtors_present(device, context, queue, false, false);
-    CHECK_ERROR(last_error);
-    error |= last_error;
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_API_TEST_CTORS_DTORS_HPP
diff --git a/test_conformance/clcpp/api/test_dtors.hpp b/test_conformance/clcpp/api/test_dtors.hpp
deleted file mode 100644
index e04cbb1cdd..0000000000
--- a/test_conformance/clcpp/api/test_dtors.hpp
+++ /dev/null
@@ -1,559 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_API_TEST_DTORS_HPP
-#define TEST_CONFORMANCE_CLCPP_API_TEST_DTORS_HPP
-
-#include <vector>
-#include <limits>
-#include <algorithm>
-#include <numeric>
-
-#include "../common.hpp"
-
-// TEST 1
-// Verify that destructor is executed.
-
-// How: destructor of struct dtor_test_class has a side effect: zeroing buffer. If values
-// in buffer are not zeros after releasing program, destructor was not executed.
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-const char * program_test_dtor_is_executed =
-    "__kernel void test_dtor_is_executed(global uint *output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   output[gid] = 0;\n"
-    "}\n"
-;
-#else
-const char * program_test_dtor_is_executed =
-    "#include <opencl_memory>\n"
-    "#include <opencl_work_item>\n"
-    "using namespace cl;\n"
-    // struct
-    "struct dtor_test_class {\n"
-    // non-trivial dtor
-    // set all values in buffer to 0
-    "   ~dtor_test_class() {\n"
-    "       for(ulong i = 0; i < size; i++)\n"
-    "           buffer[i] = 0;\n"
-    "   };\n"
-    "   global_ptr<uint[]> buffer;\n"
-    "   ulong size;\n"
-    "};\n"
-    // global scope program variable
-    "dtor_test_class global_var;\n"
-
-    // values in output __MUST BE__ greater than 0 for the test to work
-    // correctly
-    "__kernel void test_dtor_is_executed(global_ptr<uint[]> output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    // set buffer and size in global var
-    "   if(gid == 0){\n"
-    "       global_var.buffer = output;\n"
-    "       global_var.size = get_global_size(0);\n"
-    "   }\n"
-    "}\n"
-;
-#endif
-
-AUTO_TEST_CASE(test_global_scope_dtor_is_executed)
-(cl_device_id device, cl_context context, cl_command_queue queue, int count)
-{
-    int error = CL_SUCCESS;
-
-    cl_mem output_buffer;
-    cl_program program;
-    cl_kernel kernel;
-
-    size_t dim = 1;
-    size_t work_size[1];
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        program_test_dtor_is_executed, "test_dtor_is_executed"
-    );
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        program_test_dtor_is_executed, "test_dtor_is_executed", "", false
-    );
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        program_test_dtor_is_executed, "test_dtor_is_executed"
-    );
-    RETURN_ON_ERROR(error)
-#endif
-
-    // host vector, size == count, output[0...count-1] == 0xbeefbeef (3203383023)
-    // values in output __MUST BE__ greater than 0 for the test to work correctly
-    std::vector<cl_uint> output(count, cl_uint(0xbeefbeef));
-    output_buffer =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_uint) * output.size(), NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    error = clEnqueueWriteBuffer(
-        queue, output_buffer, CL_TRUE,
-        0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()),
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueWriteBuffer")
-
-    error = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    work_size[0] = output.size();
-    error = clEnqueueNDRangeKernel(
-        queue, kernel,
-        dim, NULL, work_size, NULL,
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-    // Release kernel and program
-    // Dtor should be called now
-    error = clReleaseKernel(kernel);
-    RETURN_ON_CL_ERROR(error, "clReleaseKernel")
-    error = clReleaseProgram(program);
-    RETURN_ON_CL_ERROR(error, "clReleaseProgram")
-
-    // Finish
-    error = clFinish(queue);
-    RETURN_ON_CL_ERROR(error, "clFinish")
-
-    // Read output buffer
-    error = clEnqueueReadBuffer(
-        queue, output_buffer, CL_TRUE,
-        0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()),
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-    size_t sum = std::accumulate(output.begin(), output.end(), size_t(0));
-    if(sum != 0)
-    {
-        error = -1;
-        CHECK_ERROR_MSG(error, "Test test_dtor_is_executed failed.");
-    }
-
-    clReleaseMemObject(output_buffer);
-    return error;
-}
-
-// TEST 2
-// Verify that multiple destructors, if present, are executed. Order between multiple
-// destructors is undefined.
-// Verify that each destructor is executed only once.
-
-// How:
-// 0) dtor_test_class struct has a global pointer to a buffer, it's set by
-// test_dtors_executed_once kernel.
-// 1) Destructors have a side effect: each dtor writes to its part of the buffer. If all
-// dtors are executed, all values in that buffer should be changed.
-// 2) The first time destructors are executed, they set their parts of the buffer to zero.
-// Next time to 1, next time to 2 etc. Since dtors should be executed only once, all
-// values in that buffer should be equal to zero.
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-const char * program_test_dtors_executed_once =
-    "__kernel void test_dtors_executed_once(global uint *output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   output[gid] = 0;\n"
-    "}\n"
-;
-#else
-const char * program_test_dtors_executed_once =
-    "#include <opencl_memory>\n"
-    "#include <opencl_work_item>\n"
-    "using namespace cl;\n"
-    // struct
-    "struct dtor_test_class {\n"
-    // non-trivial dtor
-    // Set all values in range [start; end - 1] in buffer to counter.
-    // If dtor is executed only once (correct), all values in range
-    // [start; end - 1] in buffer should be equal to zero after releasing
-    // the program
-    "   ~dtor_test_class() {\n"
-    "       for(ulong i = start; i < end; i++){\n"
-    "           buffer[i] = counter;\n"
-    "       };\n"
-    "       counter++;\n"
-    "   };\n"
-    "   global_ptr<uint[]> buffer;\n"
-    "   ulong start;\n"
-    "   ulong end;\n"
-    "   ulong counter;\n"
-    "};\n"
-    // global scope program variables
-    "dtor_test_class global_var0;\n"
-    "dtor_test_class global_var1;\n"
-    "dtor_test_class global_var2;\n"
-    "dtor_test_class global_var3;\n"
-
-    // values in output __MUST BE__ greater than 0 for the test to work correctly
-    "__kernel void test_dtors_executed_once(global_ptr<uint[]> output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    // set buffer and size in global var
-    "   if(gid == 0){\n"
-    "       ulong end = get_global_size(0) / 4;"
-    // global_var0
-    "       global_var0.buffer = output;\n"
-    "       global_var0.start = 0;\n"
-    "       global_var0.end = end;\n"
-    "       global_var0.counter = 0;\n"
-    // global_var1
-    "       global_var1.buffer = output;\n"
-    "       global_var1.start = end;\n"
-    "       end += get_global_size(0) / 4;\n"
-    "       global_var1.end = end;\n"
-    "       global_var1.counter = 0;\n"
-    // global_var2
-    "       global_var2.buffer = output;\n"
-    "       global_var2.start = end;\n"
-    "       end += get_global_size(0) / 4;\n"
-    "       global_var2.end = end;\n"
-    "       global_var2.counter = 0;\n"
-    // global_var3
-    "       global_var3.buffer = output;\n"
-    "       global_var3.start = end;\n"
-    "       global_var3.end = get_global_size(0);\n"
-    "       global_var3.counter = 0;\n"
-    "   }\n"
-    "}\n"
-;
-#endif
-
-AUTO_TEST_CASE(test_global_scope_dtors_executed_once)
-(cl_device_id device, cl_context context, cl_command_queue queue, int count)
-{
-    int error = CL_SUCCESS;
-
-    cl_mem output_buffer;
-    cl_program program;
-    cl_kernel kernel;
-
-    size_t dim = 1;
-    size_t work_size[1];
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        program_test_dtors_executed_once, "test_dtors_executed_once"
-    );
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        program_test_dtors_executed_once, "test_dtors_executed_once", "", false
-    );
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        program_test_dtors_executed_once, "test_dtors_executed_once"
-    );
-    RETURN_ON_ERROR(error)
-#endif
-
-    // host vector, size == count, output[0...count-1] == 0xbeefbeef (3203383023)
-    // values in output __MUST BE__ greater than 0 for the test to work correctly
-    cl_uint init_value = cl_uint(0xbeefbeef);
-    std::vector<cl_uint> output(count, init_value);
-    output_buffer =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_uint) * output.size(), NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    error = clEnqueueWriteBuffer(
-        queue, output_buffer, CL_TRUE,
-        0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()),
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueWriteBuffer")
-
-    error = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    work_size[0] = output.size();
-    error = clEnqueueNDRangeKernel(
-        queue, kernel,
-        dim, NULL, work_size, NULL,
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-
-    // Increments the program reference count. Twice
-    error = clRetainProgram(program);
-    RETURN_ON_CL_ERROR(error, "clRetainProgram")
-    error = clRetainProgram(program);
-    RETURN_ON_CL_ERROR(error, "clRetainProgram")
-
-    // Should just decrement the program reference count.
-    error = clReleaseProgram(program);
-    RETURN_ON_CL_ERROR(error, "clReleaseProgram")
-    error = clFinish(queue);
-    RETURN_ON_CL_ERROR(error, "clFinish")
-
-    // Should just decrement the program reference count.
-    error = clReleaseProgram(program);
-    RETURN_ON_CL_ERROR(error, "clReleaseProgram")
-    error = clFinish(queue);
-    RETURN_ON_CL_ERROR(error, "clFinish")
-
-#ifndef USE_OPENCLC_KERNELS
-    // At this point global scope variables should not be destroyed,
-    // values in output buffer should not be modified.
-
-    // Read output buffer
-    error = clEnqueueReadBuffer(
-        queue, output_buffer, CL_TRUE,
-        0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()),
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-    for(auto& i : output)
-    {
-        if(i != init_value)
-        {
-            log_error("ERROR: Test test_global_scope_dtors_executed_once failed.");
-            log_error("\tDestructors were executed prematurely.\n");
-            RETURN_ON_ERROR(-1)
-        }
-    }
-#endif
-
-    // Release kernel and program, destructors should be called now
-    error = clReleaseKernel(kernel);
-    RETURN_ON_CL_ERROR(error, "clReleaseKernel")
-    error = clReleaseProgram(program);
-    RETURN_ON_CL_ERROR(error, "clReleaseProgram")
-
-    // Finish
-    error = clFinish(queue);
-    RETURN_ON_CL_ERROR(error, "clFinish")
-
-    // Read output buffer
-    error = clEnqueueReadBuffer(
-        queue, output_buffer, CL_TRUE,
-        0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()),
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-    size_t sum = std::accumulate(output.begin(), output.end(), size_t(0));
-    if(sum != 0)
-    {
-        log_error("ERROR: Test test_global_scope_dtors_executed_once failed.");
-        // Maybe some dtors were not run?
-        for(auto& i : output)
-        {
-            if(i == init_value)
-            {
-                log_error("\tSome dtors were not executed.");
-                break;
-            }
-        }
-        log_error("\n");
-        RETURN_ON_ERROR(-1)
-    }
-
-    // Clean
-    clReleaseMemObject(output_buffer);
-    return error;
-}
-
-// TEST3
-// Verify that ND-range during destructor execution is set to (1,1,1)
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-const char * program_test_dtor_ndrange =
-    "__kernel void test_dtor_ndrange(global uint *output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   output[gid] = 0;\n"
-    "}\n"
-;
-#else
-const char * program_test_dtor_ndrange =
-    "#include <opencl_memory>\n"
-    "#include <opencl_work_item>\n"
-    "using namespace cl;\n"
-    // struct
-    "struct dtor_test_class {\n"
-    // non-trivial dtor
-    // set all values in buffer to 0 only if ND-range is (1, 1, 1)
-    "   ~dtor_test_class() {\n"
-    "       if(check()){\n"
-    "           for(ulong i = 0; i < size; i++)\n"
-    "               buffer[i] = 0;\n"
-    "       }\n"
-    "   };\n"
-    // return true if the ND-range is (1, 1, 1); otherwise - false
-    "   bool check() {\n"
-    "       return (get_global_size(0) == 1)"
-              " && (get_global_size(1) == 1)"
-              " && (get_global_size(2) == 1);\n"
-    "   }"
-    "   ulong size;\n"
-    "   global_ptr<uint[]> buffer;\n"
-    "};\n"
-    // global scope program variable
-    "dtor_test_class global_var;\n"
-
-    // values in output __MUST BE__ greater than 0 for the test to work correctly
-    "__kernel void test_dtor_ndrange(global_ptr<uint[]> output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    // set buffer and size in global var
-    "   if(gid == 0){\n"
-    "       global_var.buffer = output;\n"
-    "       global_var.size = get_global_size(0);\n"
-    "   }\n"
-    "}\n"
-;
-#endif
-
-AUTO_TEST_CASE(test_global_scope_dtor_ndrange)
-(cl_device_id device, cl_context context, cl_command_queue queue, int count)
-{
-    int error = CL_SUCCESS;
-
-    cl_mem output_buffer;
-    cl_program program;
-    cl_kernel kernel;
-
-    size_t dim = 1;
-    size_t work_size[1];
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        program_test_dtor_ndrange, "test_dtor_ndrange"
-    );
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        program_test_dtor_ndrange, "test_dtor_ndrange", "", false
-    );
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        program_test_dtor_ndrange, "test_dtor_ndrange"
-    );
-    RETURN_ON_ERROR(error)
-#endif
-
-    // host vector, size == count, output[0...count-1] == 0xbeefbeef (3203383023)
-    // values in output __MUST BE__ greater than 0 for the test to work correctly
-    std::vector<cl_uint> output(count, cl_uint(0xbeefbeef));
-    output_buffer =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_uint) * output.size(), NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    error = clEnqueueWriteBuffer(
-        queue, output_buffer, CL_TRUE,
-        0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()),
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueWriteBuffer")
-
-    error = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    work_size[0] = output.size();
-    error = clEnqueueNDRangeKernel(
-        queue, kernel,
-        dim, NULL, work_size, NULL,
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-    // Release kernel and program
-    // Dtor should be called now
-    error = clReleaseKernel(kernel);
-    RETURN_ON_CL_ERROR(error, "clReleaseKernel")
-    error = clReleaseProgram(program);
-    RETURN_ON_CL_ERROR(error, "clReleaseProgram")
-
-    // Finish
-    error = clFinish(queue);
-    RETURN_ON_CL_ERROR(error, "clFinish")
-
-    // Read output buffer
-    error = clEnqueueReadBuffer(
-        queue, output_buffer, CL_TRUE,
-        0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()),
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-    size_t sum = std::accumulate(output.begin(), output.end(), size_t(0));
-    if(sum != 0)
-    {
-        error = -1;
-        CHECK_ERROR_MSG(error, "Test test_dtor_ndrange failed.");
-    }
-
-    clReleaseMemObject(output_buffer);
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_API_TEST_DTORS_HPP
diff --git a/test_conformance/clcpp/api/test_spec_consts.hpp b/test_conformance/clcpp/api/test_spec_consts.hpp
deleted file mode 100644
index c403f4d9ee..0000000000
--- a/test_conformance/clcpp/api/test_spec_consts.hpp
+++ /dev/null
@@ -1,480 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_API_TEST_SPEC_CONSTS_HPP
-#define TEST_CONFORMANCE_CLCPP_API_TEST_SPEC_CONSTS_HPP
-
-#include <vector>
-#include <limits>
-#include <algorithm>
-
-#include "../common.hpp"
-
-// TEST 1
-// Verify that if left unset the specialization constant defaults to the default value set in SPIR-V (zero).
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-const char * kernel_test_spec_consts_defaults =
-    "__kernel void test_spec_consts_defaults(global int *output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   output[gid] = 0;\n"
-    "}\n"
-;
-#else
-const char * kernel_test_spec_consts_defaults =
-    "#include <opencl_memory>\n"
-    "#include <opencl_work_item>\n"
-    "#include <opencl_spec_constant>\n"
-    "using namespace cl;\n"
-    "spec_constant<char,  1> spec1(0);\n"
-    "spec_constant<uchar, 2> spec2(0);\n"
-    "spec_constant<short, 3> spec3(0);\n"
-    "spec_constant<ushort,4> spec4(0);\n"
-    "spec_constant<int,   5> spec5(0);\n"
-    "spec_constant<uint,  6> spec6(0);\n"
-    "spec_constant<long,  7> spec7(0);\n"
-    "spec_constant<ulong, 8> spec8(0);\n"
-    "spec_constant<float, 9> spec9(0.0f);\n"
-    "#ifdef cl_khr_fp64\n"
-    "spec_constant<double, 10> spec10(0.0);\n"
-    "#endif\n"
-    "#ifdef cl_khr_fp16\n"
-    "spec_constant<half, 11> spec11(0.0h);\n"
-    "#endif\n"
-    "__kernel void test_spec_consts_defaults(global_ptr<int[]> output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   int result = 0;\n"
-    "   if(get(spec1) != char(0))   result = 1;\n"
-    "   if(get(spec2) != uchar(0))  result = 1;\n"
-    "   if(get(spec3) != short(0))  result = 1;\n"
-    "   if(get(spec4) != ushort(0)) result = 1;\n"
-    "   if(get(spec5) != int(0))    result = 1;\n"
-    "   if(get(spec6) != uint(0))   result = 1;\n"
-    "   if(get(spec7) != long(0))   result = 1;\n"
-    "   if(get(spec8) != ulong(0))  result = 1;\n"
-    "   if(get(spec9) != float(0))  result = 1;\n"
-    "#ifdef cl_khr_fp64\n"
-    "   if(get(spec10) != double(0)) result = 1;\n"
-    "#endif\n"
-    "#ifdef cl_khr_fp16\n"
-    "   if(get(spec11) != half(0)) result = 1;\n"
-    "#endif\n"
-    "   output[gid] = result;\n"
-    "}\n"
-;
-#endif
-
-AUTO_TEST_CASE(test_spec_consts_defaults)
-(cl_device_id device, cl_context context, cl_command_queue queue, int count)
-{
-    int error = CL_SUCCESS;
-
-    cl_mem output_buffer;
-    cl_program program;
-    cl_kernel kernel;
-
-    size_t dim = 1;
-    size_t work_size[1];
-
-    std::string options = "";
-    if(is_extension_available(device, "cl_khr_fp16"))
-    {
-        options += " -cl-fp16-enable";
-    }
-    if(is_extension_available(device, "cl_khr_fp64"))
-    {
-        options += " -cl-fp64-enable";
-    }
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(context, &program, &kernel, kernel_test_spec_consts_defaults, "test_spec_consts_defaults", options);
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(context, &program, &kernel, kernel_test_spec_consts_defaults, "test_spec_consts_defaults", "", false);
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    // Spec constants are NOT set before clBuildProgram (called in create_opencl_kernel), so
-    // they all should default to the default value set in SPIR-V (zero).
-    error = create_opencl_kernel(context, &program, &kernel, kernel_test_spec_consts_defaults, "test_spec_consts_defaults", options);
-    RETURN_ON_ERROR(error)
-#endif
-
-    // host vector, size == 1, output[0] == 1
-    std::vector<cl_int> output(1, cl_int(1));
-    output_buffer =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_int) * output.size(), NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    error = clEnqueueWriteBuffer(queue, output_buffer, CL_TRUE, 0, sizeof(cl_int) * output.size(), static_cast<void *>(output.data()), 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueWriteBuffer")
-
-    error = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    work_size[0] = output.size();
-    error = clEnqueueNDRangeKernel(queue, kernel, dim, NULL, work_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKerne")
-
-    error = clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, sizeof(cl_int) * output.size(), static_cast<void *>(output.data()), 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-    // if output[0] != 0, then some spec constant(s) did not default to zero.
-    if(output[0] != 0)
-    {
-        RETURN_ON_ERROR_MSG(-1, "Test test_spec_consts_defaults failed, output[0]: %d.", output[0])
-    }
-
-    clReleaseMemObject(output_buffer);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-// TEST 2
-// Verify that setting an existing specialization constant affects only
-// the value of that constant and not of other specialization constants.
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-const char * kernel_test_spec_consts_many_constants =
-    "__kernel void test_spec_consts_many_constants(global int *output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   output[gid] = 0;\n"
-    "}\n"
-;
-#else
-const char * kernel_test_spec_consts_many_constants =
-    "#include <opencl_memory>\n"
-    "#include <opencl_work_item>\n"
-    "#include <opencl_spec_constant>\n"
-    "using namespace cl;\n"
-    "spec_constant<int, 1> spec1(0);\n"
-    "spec_constant<int, 2> spec2(0);\n"
-    "spec_constant<int, 3> spec3(0);\n"
-    "__kernel void test_spec_consts_defaults(global_ptr<int[]> output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   int result = 0;\n"
-    "   if(get(spec1) != int(-1024)) result += 1;\n"
-    "   if(get(spec2) != int(0))     result += 2;\n"
-    "   if(get(spec3) != int(1024))  result += 4;\n"
-    "   output[gid] = result;\n"
-    "}\n"
-;
-#endif
-
-AUTO_TEST_CASE(test_spec_consts_many_constants)
-(cl_device_id device, cl_context context, cl_command_queue queue, int count)
-{
-    int error = CL_SUCCESS;
-
-    cl_mem output_buffer;
-    cl_program program;
-    cl_kernel kernel;
-
-    size_t dim = 1;
-    size_t work_size[1];
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        kernel_test_spec_consts_many_constants, "test_spec_consts_many_constants"
-    );
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        kernel_test_spec_consts_many_constants, "test_spec_consts_many_constants", "", false
-    );
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    // Create program
-    error = create_openclcpp_program(context, &program, 1, &kernel_test_spec_consts_many_constants);
-    RETURN_ON_ERROR(error)
-
-    // Set specialization constants
-
-    // clSetProgramSpecializationConstant(
-    //     cl_program /* program */, cl_uint /* spec_id */, size_t  /* spec_size */,const void* /* spec_value */
-    // )
-    cl_int spec1 = -1024;
-    cl_int spec3 = 1024;
-    // Set spec1
-    error = clSetProgramSpecializationConstant(program, cl_uint(1), sizeof(cl_int), static_cast<void*>(&spec1));
-    RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-    // Specialization constant spec2 should default to zero
-    // Set spec3
-    error = clSetProgramSpecializationConstant(program, cl_uint(3), sizeof(cl_int), static_cast<void*>(&spec3));
-    RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-
-    // Build program and create kernel
-    error = build_program_create_kernel_helper(
-        context, &program, &kernel, 1, &kernel_test_spec_consts_many_constants, "test_spec_consts_many_constants"
-    );
-    RETURN_ON_ERROR(error)
-#endif
-
-    // host vector, size == 1, output[0] == 1
-    std::vector<cl_int> output(1, cl_int(1));
-    output_buffer =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_int) * output.size(), NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    error = clEnqueueWriteBuffer(queue, output_buffer, CL_TRUE, 0, sizeof(cl_int) * output.size(), static_cast<void *>(output.data()), 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueWriteBuffer")
-
-    error = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    work_size[0] = output.size();
-    error = clEnqueueNDRangeKernel(queue, kernel, dim, NULL, work_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-    error = clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, sizeof(cl_int) * output.size(), static_cast<void *>(output.data()), 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-    // if output[0] != 0, then values of spec constants were incorrect
-    if(output[0] != 0)
-    {
-        RETURN_ON_ERROR_MSG(-1, "Test test_spec_consts_many_constants failed, output[0]: %d.", output[0]);
-    }
-
-    clReleaseMemObject(output_buffer);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-// TEST 3
-// Verify that the API correctly handles the size of a specialization constant by exercising
-// the API for specialization constants of different types (int, bool, float, etc.)
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-const char * kernel_test_spec_consts_different_types =
-    "__kernel void test_spec_consts_different_types(global int *output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   output[gid] = 0;\n"
-    "}\n"
-;
-#else
-const char * kernel_test_spec_consts_different_types =
-    "#include <opencl_memory>\n"
-    "#include <opencl_work_item>\n"
-    "#include <opencl_spec_constant>\n"
-    "#include <opencl_limits>\n"
-    "using namespace cl;\n"
-    "spec_constant<char,  1> spec1(0);\n"
-    "spec_constant<uchar, 2> spec2(0);\n"
-    "spec_constant<short, 3> spec3(0);\n"
-    "spec_constant<ushort,4> spec4(0);\n"
-    "spec_constant<int,   5> spec5(0);\n"
-    "spec_constant<uint,  6> spec6(0);\n"
-    "spec_constant<long,  7> spec7(0);\n"
-    "spec_constant<ulong, 8> spec8(0);\n"
-    "spec_constant<float, 9> spec9(0.0f);\n"
-    "#ifdef cl_khr_fp64\n"
-    "spec_constant<double, 10> spec10(0.0);\n"
-    "#endif\n"
-    "#ifdef cl_khr_fp16\n"
-    "spec_constant<half, 11> spec11(0.0h);\n"
-    "#endif\n"
-    "__kernel void test_spec_consts_different_types(global_ptr<int[]> output)\n"
-    "{\n"
-    "   ulong gid = get_global_id(0);\n"
-    "   int result = 0;\n"
-    "   if(get(spec1) != char(CHAR_MAX))    result += 1;\n"
-    "   if(get(spec2) != uchar(UCHAR_MAX))  result += 2;\n"
-    "   if(get(spec3) != short(SHRT_MAX))   result += 4;\n"
-    "   if(get(spec4) != ushort(USHRT_MAX)) result += 8;\n"
-    "   if(get(spec5) != int(INT_MAX))      result += 16;\n"
-    "   if(get(spec6) != uint(UINT_MAX))    result += 32;\n"
-    "   if(get(spec7) != long(LONG_MAX))    result += 64;\n"
-    "   if(get(spec8) != ulong(ULONG_MAX))  result += 128;\n"
-    "   if(get(spec9) != float(FLT_MAX))    result += 256;\n"
-    "#ifdef cl_khr_fp64\n"
-    "   if(get(spec10) != double(DBL_MAX)) result += 512;\n"
-    "#endif\n"
-    "#ifdef cl_khr_fp16\n"
-    "   if(get(spec11) != half(HALF_MAX)) result += 1024;\n"
-    "#endif\n"
-    "   output[gid] = result;\n"
-    "}\n"
-;
-#endif
-
-
-AUTO_TEST_CASE(test_spec_consts_different_types)
-(cl_device_id device, cl_context context, cl_command_queue queue, int count)
-{
-    int error = CL_SUCCESS;
-
-    cl_mem output_buffer;
-    cl_program program;
-    cl_kernel kernel;
-
-    size_t dim = 1;
-    size_t work_size[1];
-
-    std::string options = "";
-    if(is_extension_available(device, "cl_khr_fp16"))
-    {
-        options += " -cl-fp16-enable";
-    }
-    if(is_extension_available(device, "cl_khr_fp64"))
-    {
-        options += " -cl-fp64-enable";
-    }
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(context, &program, &kernel, kernel_test_spec_consts_different_types, "test_spec_consts_different_types", options);
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(context, &program, &kernel, kernel_test_spec_consts_different_types, "test_spec_consts_different_types", "", false);
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    // Create program
-    error = create_openclcpp_program(context, &program, 1, &kernel_test_spec_consts_different_types, options.c_str());
-    RETURN_ON_ERROR(error)
-
-    // Set specialization constants
-    cl_uint spec_id = 1;
-
-    cl_char   spec1 = CL_CHAR_MAX;
-    cl_uchar  spec2 = CL_UCHAR_MAX;
-    cl_short  spec3 = CL_SHRT_MAX;
-    cl_ushort spec4 = CL_USHRT_MAX;
-    cl_int    spec5 = CL_INT_MAX;
-    cl_uint   spec6 = CL_UINT_MAX;
-    cl_long   spec7 = CL_LONG_MAX;
-    cl_ulong  spec8 = CL_ULONG_MAX;
-    cl_float  spec9 = CL_FLT_MAX;
-    cl_double spec10 = CL_DBL_MAX;
-    cl_half   spec11 = CL_HALF_MAX;
-
-    // Set spec1
-    error = clSetProgramSpecializationConstant(program, spec_id++, sizeof(cl_char), static_cast<void*>(&spec1));
-    RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-    // Set spec2
-    error = clSetProgramSpecializationConstant(program, spec_id++, sizeof(cl_uchar), static_cast<void*>(&spec2));
-    RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-    // Set spec3
-    error = clSetProgramSpecializationConstant(program, spec_id++, sizeof(cl_short), static_cast<void*>(&spec3));
-    RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-    // Set spec4
-    error = clSetProgramSpecializationConstant(program, spec_id++, sizeof(cl_ushort), static_cast<void*>(&spec4));
-    RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-    // Set spec5
-    error = clSetProgramSpecializationConstant(program, spec_id++, sizeof(cl_int), static_cast<void*>(&spec5));
-    RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-    // Set spec6
-    error = clSetProgramSpecializationConstant(program, spec_id++, sizeof(cl_uint), static_cast<void*>(&spec6));
-    RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-    // Set spec7
-    error = clSetProgramSpecializationConstant(program, spec_id++, sizeof(cl_long), static_cast<void*>(&spec7));
-    RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-    // Set spec8
-    error = clSetProgramSpecializationConstant(program, spec_id++, sizeof(cl_ulong), static_cast<void*>(&spec8));
-    RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-    // Set spec9
-    error = clSetProgramSpecializationConstant(program, spec_id++, sizeof(cl_float), static_cast<void*>(&spec9));
-    RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-    // Set spec10
-    if(is_extension_available(device, "cl_khr_fp64"))
-    {
-        error = clSetProgramSpecializationConstant(program, cl_uint(10), sizeof(cl_double), static_cast<void*>(&spec10));
-        RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-    }
-    // Set spec11
-    if(is_extension_available(device, "cl_khr_fp16"))
-    {
-        error = clSetProgramSpecializationConstant(program, cl_uint(11), sizeof(cl_half), static_cast<void*>(&spec11));
-        RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-    }
-
-    // Build program and create kernel
-    error = build_program_create_kernel_helper(
-        context, &program, &kernel, 1, &kernel_test_spec_consts_many_constants, "test_spec_consts_many_constants"
-    );
-    RETURN_ON_ERROR(error)
-#endif
-
-    // Copy output to output_buffer, run kernel, copy output_buffer back to output, check result
-
-    // host vector, size == 1, output[0] == 1
-    std::vector<cl_int> output(1, cl_int(1));
-    output_buffer =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(cl_int) * output.size(), NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    error = clEnqueueWriteBuffer(queue, output_buffer, CL_TRUE, 0, sizeof(cl_int) * output.size(), static_cast<void *>(output.data()), 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueWriteBuffer")
-
-    error = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    work_size[0] = output.size();
-    error = clEnqueueNDRangeKernel(queue, kernel, dim, NULL, work_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-    error = clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, sizeof(cl_int) * output.size(), static_cast<void *>(output.data()), 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-    // if output[0] != 0, then some spec constants had incorrect values
-    if(output[0] != 0)
-    {
-        RETURN_ON_ERROR_MSG(-1, "Test test_spec_consts_different_types failed, output[0]: %d.", output[0])
-    }
-
-    clReleaseMemObject(output_buffer);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_API_TEST_SPEC_CONSTS_HPP
diff --git a/test_conformance/clcpp/atomics/CMakeLists.txt b/test_conformance/clcpp/atomics/CMakeLists.txt
deleted file mode 100644
index 4fb4bfd198..0000000000
--- a/test_conformance/clcpp/atomics/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_ATOMICS)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/atomics/atomic_fetch.hpp b/test_conformance/clcpp/atomics/atomic_fetch.hpp
deleted file mode 100644
index 39a9948879..0000000000
--- a/test_conformance/clcpp/atomics/atomic_fetch.hpp
+++ /dev/null
@@ -1,308 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_ATOMICS_ATOMIC_FETCH_HPP
-#define TEST_CONFORMANCE_CLCPP_ATOMICS_ATOMIC_FETCH_HPP
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-
-const size_t atomic_bucket_size = 100;
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <class func_type, class type>
-std::string generate_kernel_atomic_fetch(func_type func)
-{
-    std::string in1_value = "input[gid]";
-    std::string out1_value = "output[gid / " + std::to_string(atomic_bucket_size) + "]";
-    std::string function_call = "atomic_" + func.str() + "(&" + out1_value + ", " + in1_value + ")";
-    return
-        "" + func.defs() +
-        "__kernel void test_" + func.str() + "(global " + type_name<type>() + " *input, global atomic_" + type_name<type>() + " *output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    " + function_call + ";\n"
-        "}\n";
-}
-#else
-template <class func_type, class type>
-std::string generate_kernel_atomic_fetch(func_type func)
-{
-    std::string in1_value = "input[gid]";
-    std::string out1_value = "output[gid / " + std::to_string(atomic_bucket_size) + "]";
-    std::string function_call = func.str() + "(" + in1_value + ")";
-    return
-        "" + func.defs() +
-        "" + func.headers() +
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "using namespace cl;\n"
-        "__kernel void test_" + func.str() + "(global_ptr<" + type_name<type>() +  "[]> input,"
-                                              "global_ptr<atomic<" + type_name<type>() + ">[]> output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    " + out1_value + "." + function_call + ";\n"
-        "}\n";
-}
-#endif
-
-template<class TYPE, class atomic_fetch>
-bool verify_atomic_fetch(const std::vector<TYPE> &in, const std::vector<TYPE> &out, atomic_fetch op)
-{
-    for (size_t i = 0; i < out.size(); i++)
-    {
-        TYPE expected = op.init_out();
-        for (size_t k = 0; k < atomic_bucket_size; k++)
-        {
-            const size_t in_i = i * atomic_bucket_size + k;
-            if (in_i >= in.size())
-                break;
-            expected = op(expected, in[in_i]);
-        }
-        if (expected != out[i])
-        {
-            print_error_msg(expected, out[i], i, op);
-            return false;
-        }
-    }
-    return true;
-}
-
-template <class atomic_fetch>
-int test_atomic_fetch_func(cl_device_id device, cl_context context, cl_command_queue queue, size_t count, atomic_fetch op)
-{
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t work_size[1];
-    int err;
-
-    typedef typename atomic_fetch::in_type TYPE;
-
-    // Don't run test for unsupported types
-    if (!(type_supported<TYPE>(device)))
-    {
-        return CL_SUCCESS;
-    }
-    if (sizeof(TYPE) == 8 &&
-        (!is_extension_available(device, "cl_khr_int64_base_atomics") ||
-         !is_extension_available(device, "cl_khr_int64_extended_atomics")))
-    {
-        return CL_SUCCESS;
-    }
-
-    std::string code_str = generate_kernel_atomic_fetch<atomic_fetch, TYPE>(op);
-    std::string kernel_name("test_"); kernel_name += op.str();
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name, "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(err)
-#endif
-
-    std::vector<TYPE> input = generate_input<TYPE>(count, op.min1(), op.max1(), std::vector<TYPE>());
-    std::vector<TYPE> output = generate_output<TYPE>((count - 1) / atomic_bucket_size + 1);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(TYPE) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(TYPE) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(TYPE) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer")
-
-    const TYPE pattern = op.init_out();
-    err = clEnqueueFillBuffer(queue, buffers[1], &pattern, sizeof(pattern), 0, sizeof(TYPE) * output.size(), 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueFillBuffer")
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg")
-    err = clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg")
-
-    work_size[0] = count;
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel")
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(TYPE) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer")
-
-    if (!verify_atomic_fetch(input, output, op))
-    {
-        RETURN_ON_ERROR_MSG(-1, "test_%s %s failed", op.str().c_str(), type_name<TYPE>().c_str());
-    }
-    log_info("test_%s %s passed\n", op.str().c_str(), type_name<TYPE>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-
-template<class TYPE>
-struct atomic_fetch
-{
-    typedef TYPE in_type;
-
-    std::string decl_str()
-    {
-        return type_name<TYPE>();
-    }
-
-    std::string defs()
-    {
-        std::string defs;
-        if (sizeof(TYPE) == 8)
-        {
-            defs += "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
-            defs += "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
-        }
-        return defs;
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_atomic>\n";
-    }
-
-    TYPE min1()
-    {
-        return 0;
-    }
-
-    TYPE max1()
-    {
-        return 1000;
-    }
-};
-
-
-#define DEF_ATOMIC_FETCH_FUNC(CLASS_NAME, FUNC_NAME, HOST_FUNC_EXPRESSION, INIT_OUT) \
-template<class TYPE> \
-struct CLASS_NAME : public atomic_fetch<TYPE> \
-{ \
-    std::string str() \
-    { \
-        return #FUNC_NAME; \
-    } \
-    \
-    TYPE init_out() \
-    { \
-        return INIT_OUT; \
-    } \
-    \
-    TYPE operator()(const TYPE& x, const TYPE& y) \
-    { \
-        return HOST_FUNC_EXPRESSION; \
-    } \
-};
-
-DEF_ATOMIC_FETCH_FUNC(atomic_fetch_add, fetch_add, x + y, 0)
-DEF_ATOMIC_FETCH_FUNC(atomic_fetch_sub, fetch_sub, x - y, (std::numeric_limits<TYPE>::max)())
-
-DEF_ATOMIC_FETCH_FUNC(atomic_fetch_and, fetch_and, x & y, (std::numeric_limits<TYPE>::max)())
-DEF_ATOMIC_FETCH_FUNC(atomic_fetch_or,  fetch_or,  x | y, 0)
-DEF_ATOMIC_FETCH_FUNC(atomic_fetch_xor, fetch_xor, x ^ y, 0)
-
-DEF_ATOMIC_FETCH_FUNC(atomic_fetch_max, fetch_max, (std::max)(x, y), 0)
-DEF_ATOMIC_FETCH_FUNC(atomic_fetch_min, fetch_min, (std::min)(x, y), (std::numeric_limits<TYPE>::max)())
-
-#undef DEF_ATOMIC_FETCH_FUNC
-
-
-AUTO_TEST_CASE(test_atomic_fetch)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-#define TEST_ATOMIC_MACRO(TEST_CLASS) \
-    last_error = test_atomic_fetch_func( \
-        device, context, queue, n_elems, TEST_CLASS \
-    ); \
-    CHECK_ERROR(last_error) \
-    error |= last_error;
-
-    TEST_ATOMIC_MACRO((atomic_fetch_add<cl_int>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_add<cl_uint>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_add<cl_long>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_add<cl_ulong>()))
-
-    TEST_ATOMIC_MACRO((atomic_fetch_sub<cl_int>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_sub<cl_uint>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_sub<cl_long>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_sub<cl_ulong>()))
-
-    TEST_ATOMIC_MACRO((atomic_fetch_and<cl_int>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_and<cl_uint>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_and<cl_long>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_and<cl_ulong>()))
-
-    TEST_ATOMIC_MACRO((atomic_fetch_or<cl_int>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_or<cl_uint>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_or<cl_long>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_or<cl_ulong>()))
-
-    TEST_ATOMIC_MACRO((atomic_fetch_xor<cl_int>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_xor<cl_uint>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_xor<cl_long>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_xor<cl_ulong>()))
-
-    TEST_ATOMIC_MACRO((atomic_fetch_max<cl_int>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_max<cl_uint>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_max<cl_long>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_max<cl_ulong>()))
-
-    TEST_ATOMIC_MACRO((atomic_fetch_min<cl_int>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_min<cl_uint>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_min<cl_long>()))
-    TEST_ATOMIC_MACRO((atomic_fetch_min<cl_ulong>()))
-
-#undef TEST_ATOMIC_MACRO
-
-    if (error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_ATOMICS_ATOMIC_FETCH_HPP
diff --git a/test_conformance/clcpp/atomics/main.cpp b/test_conformance/clcpp/atomics/main.cpp
deleted file mode 100644
index b9f964fa33..0000000000
--- a/test_conformance/clcpp/atomics/main.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "atomic_fetch.hpp"
-
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/attributes/CMakeLists.txt b/test_conformance/clcpp/attributes/CMakeLists.txt
deleted file mode 100644
index 1b1c15aa4e..0000000000
--- a/test_conformance/clcpp/attributes/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_ATTRIBUTES)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/attributes/main.cpp b/test_conformance/clcpp/attributes/main.cpp
deleted file mode 100644
index e731c00161..0000000000
--- a/test_conformance/clcpp/attributes/main.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "test_ivdep.hpp"
-#include "test_max_size.hpp"
-#include "test_required_num_sub_groups.hpp"
-
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/attributes/test_ivdep.hpp b/test_conformance/clcpp/attributes/test_ivdep.hpp
deleted file mode 100644
index 17b1f586ee..0000000000
--- a/test_conformance/clcpp/attributes/test_ivdep.hpp
+++ /dev/null
@@ -1,418 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_ATTRIBUTES_TEST_IVDEP_HPP
-#define TEST_CONFORMANCE_CLCPP_ATTRIBUTES_TEST_IVDEP_HPP
-
-#include <sstream>
-#include <string>
-#include <tuple>
-#include <vector>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-
-
-namespace test_ivdep {
-
-enum class loop_kind
-{
-    for_loop,
-    while_loop,
-    do_loop
-};
-
-struct test_options
-{
-    loop_kind loop;
-    int ivdep_length;
-    int offset1;
-    int offset2;
-    int iter_count;
-    bool offset1_param;
-    bool offset2_param;
-    bool iter_count_param;
-    bool cond_in_header;
-    bool init_in_header;
-    bool incr_in_header;
-};
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-std::string generate_source(test_options options)
-{
-    std::string offset1s = options.offset1_param ? "offset1" : std::to_string(options.offset1);
-    std::string offset2s = options.offset2_param ? "offset2" : std::to_string(options.offset2);
-
-    std::string init = "i = 0";
-    std::string cond = std::string("i < ") + (options.iter_count_param ? "iter_count" : std::to_string(options.iter_count));
-    std::string incr = "i += 2";
-
-    std::stringstream s;
-    s << R"(
-    kernel void test(global int *a, global int *b, global int *c, int offset1, int offset2, int iter_count)
-    {
-        int i;
-    )";
-
-    // Loop #1
-    if (!options.init_in_header) s << init << ";" << std::endl;
-    if (options.loop == loop_kind::for_loop)
-        s << "for (" <<
-            (options.init_in_header ? init : "") << ";" <<
-            (options.cond_in_header ? cond : "") << ";" <<
-            (options.incr_in_header ? incr : "") << ")";
-    else if (options.loop == loop_kind::while_loop)
-        s << "while (" << (options.cond_in_header ? cond : "true") << ")";
-    else if (options.loop == loop_kind::do_loop)
-        s << "do";
-    s << "{" << std::endl;
-    if (!options.cond_in_header) s << "if (!(" << cond << ")) break;" << std::endl;
-    s << "a[i + " << offset1s << "] = b[i + " << offset1s << "] * c[i + " << offset1s << "];" << std::endl;
-    if (!options.incr_in_header) s << incr << ";" << std::endl;
-    s << "}" << std::endl;
-    if (options.loop == loop_kind::do_loop)
-        s << "while (" << (options.cond_in_header ? cond : "true") << ");" << std::endl;
-
-    // Loop #2
-    if (!options.init_in_header) s << init << ";" << std::endl;
-    if (options.loop == loop_kind::for_loop)
-        s << "for (" <<
-            (options.init_in_header ? init : "") << ";" <<
-            (options.cond_in_header ? cond : "") << ";" <<
-            (options.incr_in_header ? incr : "") << ")";
-    else if (options.loop == loop_kind::while_loop)
-        s << "while (" << (options.cond_in_header ? cond : "true") << ")";
-    else if (options.loop == loop_kind::do_loop)
-        s << "do";
-    s << "{" << std::endl;
-    if (!options.cond_in_header) s << "if (!(" << cond << ")) break;" << std::endl;
-    s << "a[i + " << offset2s << "] = a[i] + b[i];" << std::endl;
-    if (!options.incr_in_header) s << incr << ";" << std::endl;
-    s << "}" << std::endl;
-    if (options.loop == loop_kind::do_loop)
-        s << "while (" << (options.cond_in_header ? cond : "true") << ");" << std::endl;
-
-    s << "}" << std::endl;
-
-    return s.str();
-}
-#else
-std::string generate_source(test_options options)
-{
-    std::string offset1s = options.offset1_param ? "offset1" : std::to_string(options.offset1);
-    std::string offset2s = options.offset2_param ? "offset2" : std::to_string(options.offset2);
-
-    std::string init = "i = 0";
-    std::string cond = std::string("i < ") + (options.iter_count_param ? "iter_count" : std::to_string(options.iter_count));
-    std::string incr = "i += 2";
-
-    std::stringstream s;
-    s << R"(
-    #include <opencl_memory>
-    #include <opencl_work_item>
-
-    using namespace cl;
-    )";
-    s << R"(
-    kernel void test(global_ptr<int[]> a, global_ptr<int[]> b, global_ptr<int[]> c, int offset1, int offset2, int iter_count)
-    {
-        int i;
-    )";
-
-    // Loop #1
-    if (!options.init_in_header) s << init << ";" << std::endl;
-    if (options.ivdep_length > 0) s << "[[cl::ivdep]]" << std::endl;
-    if (options.loop == loop_kind::for_loop)
-        s << "for (" <<
-            (options.init_in_header ? init : "") << ";" <<
-            (options.cond_in_header ? cond : "") << ";" <<
-            (options.incr_in_header ? incr : "") << ")";
-    else if (options.loop == loop_kind::while_loop)
-        s << "while (" << (options.cond_in_header ? cond : "true") << ")";
-    else if (options.loop == loop_kind::do_loop)
-        s << "do";
-    s << "{" << std::endl;
-    if (!options.cond_in_header) s << "if (!(" << cond << ")) break;" << std::endl;
-    s << "a[i + " << offset1s << "] = b[i + " << offset1s << "] * c[i + " << offset1s << "];" << std::endl;
-    if (!options.incr_in_header) s << incr << ";" << std::endl;
-    s << "}" << std::endl;
-    if (options.loop == loop_kind::do_loop)
-        s << "while (" << (options.cond_in_header ? cond : "true") << ");" << std::endl;
-
-    // Loop #2
-    if (!options.init_in_header) s << init << ";" << std::endl;
-    if (options.ivdep_length > 0) s << "[[cl::ivdep(" << options.ivdep_length << ")]]" << std::endl;
-    if (options.loop == loop_kind::for_loop)
-        s << "for (" <<
-            (options.init_in_header ? init : "") << ";" <<
-            (options.cond_in_header ? cond : "") << ";" <<
-            (options.incr_in_header ? incr : "") << ")";
-    else if (options.loop == loop_kind::while_loop)
-        s << "while (" << (options.cond_in_header ? cond : "true") << ")";
-    else if (options.loop == loop_kind::do_loop)
-        s << "do";
-    s << "{" << std::endl;
-    if (!options.cond_in_header) s << "if (!(" << cond << ")) break;" << std::endl;
-    s << "a[i + " << offset2s << "] = a[i] + b[i];" << std::endl;
-    if (!options.incr_in_header) s << incr << ";" << std::endl;
-    s << "}" << std::endl;
-    if (options.loop == loop_kind::do_loop)
-        s << "while (" << (options.cond_in_header ? cond : "true") << ");" << std::endl;
-
-    s << "}" << std::endl;
-
-    return s.str();
-}
-#endif
-
-int test(cl_device_id device, cl_context context, cl_command_queue queue, test_options options)
-{
-    int error = CL_SUCCESS;
-
-    cl_program program;
-    cl_kernel kernel;
-
-    std::string kernel_name = "test";
-    std::string source = generate_source(options);
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name
-    );
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name, "", false
-    );
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name
-    );
-    RETURN_ON_ERROR(error)
-#endif
-
-    const size_t count = 100;
-    const size_t global_size = 1;
-
-    std::vector<int> a(count);
-    std::vector<int> b(count);
-    std::vector<int> c(count);
-    for (size_t i = 0; i < count; i++)
-    {
-        a[i] = 0;
-        b[i] = i;
-        c[i] = 1;
-    }
-
-    cl_mem a_buffer;
-    a_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
-        sizeof(int) * count, static_cast<void *>(a.data()), &error
-    );
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    cl_mem b_buffer;
-    b_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
-        sizeof(int) * count, static_cast<void *>(b.data()), &error
-    );
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    cl_mem c_buffer;
-    c_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
-        sizeof(int) * count, static_cast<void *>(c.data()),&error
-    );
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &a_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-    error = clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-    error = clSetKernelArg(kernel, 2, sizeof(cl_mem), &c_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-    error = clSetKernelArg(kernel, 3, sizeof(cl_int), &options.offset1);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-    error = clSetKernelArg(kernel, 4, sizeof(cl_int), &options.offset2);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-    error = clSetKernelArg(kernel, 5, sizeof(cl_int), &options.iter_count);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-    std::vector<int> a_output(count);
-    error = clEnqueueReadBuffer(
-        queue, a_buffer, CL_TRUE,
-        0, sizeof(int) * count,
-        static_cast<void *>(a_output.data()),
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-    for (int i = 0; i < options.iter_count; i += 2)
-    {
-        a[i + options.offset1] = b[i + options.offset1] * c[i + options.offset1];
-    }
-
-    for (int i = 0; i < options.iter_count; i += 2)
-    {
-        a[i + options.offset2] = a[i] + b[i];
-    }
-
-    for (size_t i = 0; i < count; i++)
-    {
-        const int value = a_output[i];
-        const int expected = a[i];
-        if (value != expected)
-        {
-            RETURN_ON_ERROR_MSG(-1,
-                "Test failed. Element %lu: %d should be: %d",
-                i, value, expected
-            );
-        }
-    }
-
-    clReleaseMemObject(a_buffer);
-    clReleaseMemObject(b_buffer);
-    clReleaseMemObject(c_buffer);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-const std::vector<std::tuple<int, int, int>> params{
-    std::make_tuple<int, int, int>( -1, 0, 0 ),
-    std::make_tuple<int, int, int>( -1, 3, 4 ),
-    std::make_tuple<int, int, int>( 1, 1, 1 ),
-    std::make_tuple<int, int, int>( 3, 4, 2 ),
-    std::make_tuple<int, int, int>( 3, 4, 3 ),
-    std::make_tuple<int, int, int>( 8, 10, 7 ),
-    std::make_tuple<int, int, int>( 16, 16, 16 )
-};
-const std::vector<int> iter_counts{ { 1, 4, 12, 40 } };
-
-AUTO_TEST_CASE(test_ivdep_for)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    int error = CL_SUCCESS;
-
-    for (auto param : params)
-    for (auto iter_count : iter_counts)
-    for (bool offset1_param : { false, true })
-    for (bool offset2_param : { false, true })
-    for (bool iter_count_param : { false, true })
-    for (bool cond_in_header : { false, true })
-    for (bool init_in_header : { false, true })
-    for (bool incr_in_header : { false, true })
-    {
-        test_options options;
-        options.loop = loop_kind::for_loop;
-        options.ivdep_length = std::get<0>(param);
-        options.offset1 = std::get<1>(param);
-        options.offset2 = std::get<2>(param);
-        options.iter_count = iter_count;
-        options.offset1_param = offset1_param;
-        options.offset2_param = offset2_param;
-        options.iter_count_param = iter_count_param;
-        options.cond_in_header = cond_in_header;
-        options.init_in_header = init_in_header;
-        options.incr_in_header = incr_in_header;
-
-        error = test(device, context, queue, options);
-        RETURN_ON_ERROR(error)
-    }
-
-    return error;
-}
-
-AUTO_TEST_CASE(test_ivdep_while)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    int error = CL_SUCCESS;
-
-    for (auto param : params)
-    for (auto iter_count : iter_counts)
-    for (bool offset1_param : { false, true })
-    for (bool offset2_param : { false, true })
-    for (bool iter_count_param : { false, true })
-    for (bool cond_in_header : { false, true })
-    {
-        test_options options;
-        options.loop = loop_kind::while_loop;
-        options.ivdep_length = std::get<0>(param);
-        options.offset1 = std::get<1>(param);
-        options.offset2 = std::get<2>(param);
-        options.iter_count = iter_count;
-        options.offset1_param = offset1_param;
-        options.offset2_param = offset2_param;
-        options.iter_count_param = iter_count_param;
-        options.cond_in_header = cond_in_header;
-        options.init_in_header = false;
-        options.incr_in_header = false;
-
-        error = test(device, context, queue, options);
-        RETURN_ON_ERROR(error)
-    }
-
-    return error;
-}
-
-AUTO_TEST_CASE(test_ivdep_do)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    int error = CL_SUCCESS;
-
-    for (auto param : params)
-    for (auto iter_count : iter_counts)
-    for (bool offset1_param : { false, true })
-    for (bool offset2_param : { false, true })
-    for (bool iter_count_param : { false, true })
-    for (bool cond_in_header : { false, true })
-    {
-        test_options options;
-        options.loop = loop_kind::do_loop;
-        options.ivdep_length = std::get<0>(param);
-        options.offset1 = std::get<1>(param);
-        options.offset2 = std::get<2>(param);
-        options.iter_count = iter_count;
-        options.offset1_param = offset1_param;
-        options.offset2_param = offset2_param;
-        options.iter_count_param = iter_count_param;
-        options.cond_in_header = cond_in_header;
-        options.init_in_header = false;
-        options.incr_in_header = false;
-
-        error = test(device, context, queue, options);
-        RETURN_ON_ERROR(error)
-    }
-
-    return error;
-}
-
-} // namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_ATTRIBUTES_TEST_IVDEP_HPP
diff --git a/test_conformance/clcpp/attributes/test_max_size.hpp b/test_conformance/clcpp/attributes/test_max_size.hpp
deleted file mode 100644
index 15e7ead6bd..0000000000
--- a/test_conformance/clcpp/attributes/test_max_size.hpp
+++ /dev/null
@@ -1,266 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_ATTRIBUTES_TEST_MAX_SIZE_HPP
-#define TEST_CONFORMANCE_CLCPP_ATTRIBUTES_TEST_MAX_SIZE_HPP
-
-#include <sstream>
-#include <string>
-#include <vector>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-
-
-namespace test_max_size {
-
-enum class address_space
-{
-    constant,
-    local
-};
-
-enum class param_kind
-{
-    ptr_type, // constant_ptr<T>
-    ptr,      // constant<T>*
-    ref       // constant<T>&
-};
-
-const param_kind param_kinds[] =
-{
-    param_kind::ptr_type,
-    param_kind::ptr,
-    param_kind::ref
-};
-
-struct test_options
-{
-    address_space space;
-    int max_size;
-    bool spec_const;
-    param_kind kind;
-    bool array;
-};
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-std::string generate_source(test_options options)
-{
-    std::stringstream s;
-    s << "kernel void test(";
-    s << (options.space == address_space::constant ? "constant" : "local");
-    s << " int2 *input) { }" << std::endl;
-
-    return s.str();
-}
-#else
-std::string generate_source(test_options options)
-{
-    std::string type_str = "int2";
-    if (options.array)
-        type_str += "[]";
-
-    std::stringstream s;
-    s << "#include <opencl_memory>" << std::endl;
-
-    if (options.spec_const)
-    {
-        s << "#include <opencl_spec_constant>" << std::endl;
-        s << "cl::spec_constant<int, 1> max_size_spec{ 1234567890 };" << std::endl;
-    }
-
-    s << "kernel void test(";
-    s << "[[cl::max_size(" << (options.spec_const ? "max_size_spec" : std::to_string(options.max_size)) << ")]] ";
-    s << (options.space == address_space::constant ? "cl::constant" : "cl::local");
-    if (options.kind == param_kind::ptr_type)
-        s << "_ptr<" << type_str << ">";
-    else if (options.kind == param_kind::ptr)
-        s << "<" << type_str << ">*";
-    else if (options.kind == param_kind::ref)
-        s << "<" << type_str << ">&";
-    s << " input) { }" << std::endl;
-
-    return s.str();
-}
-#endif
-
-int test(cl_device_id device, cl_context context, cl_command_queue queue, test_options options)
-{
-    int error = CL_SUCCESS;
-
-    cl_program program;
-    cl_kernel kernel;
-
-    std::string kernel_name = "test";
-    std::string source = generate_source(options);
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name
-    );
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name, "", false
-    );
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    const char *source_c_str = source.c_str();
-    error = create_openclcpp_program(context, &program, 1, &source_c_str, "");
-    RETURN_ON_ERROR(error)
-
-    if (options.spec_const)
-    {
-        error = clSetProgramSpecializationConstant(program, 1, sizeof(cl_int), &options.max_size);
-        RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-    }
-
-    error = build_program_create_kernel_helper(
-        context, &program, &kernel, 1, &source_c_str, kernel_name.c_str()
-    );
-    RETURN_ON_ERROR(error)
-#endif
-
-    const int max_size = options.max_size;
-    const int sizes[] = {
-        1,
-        max_size / 2,
-        max_size,
-        max_size + 1,
-        max_size * 2
-    };
-
-    for (int size : sizes)
-    {
-        cl_mem const_buffer;
-        if (options.space == address_space::constant)
-        {
-            const_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &error);
-            RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-            error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &const_buffer);
-            // Check the status later (depending on size and max_size values)
-        }
-        else if (options.space == address_space::local)
-        {
-            error = clSetKernelArg(kernel, 0, size, NULL);
-            // Check the status later (depending on size and max_size values)
-        }
-
-        if (size <= max_size)
-        {
-            // Correct value, must not fail
-            RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-            const size_t global_size = 123;
-            error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, NULL);
-            RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-            error = clFinish(queue);
-            RETURN_ON_CL_ERROR(error, "clFinish")
-        }
-        else
-        {
-            // Incorrect value, must fail
-            if (error != CL_MAX_SIZE_RESTRICTION_EXCEEDED)
-            {
-                RETURN_ON_ERROR_MSG(-1,
-                    "clSetKernelArg must fail with CL_MAX_SIZE_RESTRICTION_EXCEEDED,"
-                    " but returned %s (%d)", get_cl_error_string(error).c_str(), error
-                );
-            }
-        }
-
-        if (options.space == address_space::constant)
-        {
-            error = clReleaseMemObject(const_buffer);
-            RETURN_ON_CL_ERROR(error, "clReleaseMemObject")
-        }
-    }
-
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-AUTO_TEST_CASE(test_max_size_constant)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    int error = CL_SUCCESS;
-
-    cl_ulong max_size;
-    error = clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_size), &max_size, NULL);
-    RETURN_ON_CL_ERROR(error, "clGetDeviceInfo")
-
-    for (bool spec_const : { false, true })
-    for (auto kind : param_kinds)
-    for (bool array : { false, true })
-    {
-        test_options options;
-        options.space = address_space::constant;
-        options.max_size = max_size / 2;
-        options.spec_const = spec_const;
-        options.kind = kind;
-        options.array = array;
-
-        error = test(device, context, queue, options);
-        RETURN_ON_ERROR(error)
-    }
-
-    return error;
-}
-
-AUTO_TEST_CASE(test_max_size_local)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    int error = CL_SUCCESS;
-
-    cl_ulong max_size;
-    error = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(max_size), &max_size, NULL);
-    RETURN_ON_CL_ERROR(error, "clGetDeviceInfo")
-
-    for (bool spec_const : { false, true })
-    for (auto kind : param_kinds)
-    for (bool array : { false, true })
-    {
-        test_options options;
-        options.space = address_space::local;
-        options.max_size = max_size / 2;
-        options.spec_const = spec_const;
-        options.kind = kind;
-        options.array = array;
-
-        error = test(device, context, queue, options);
-        RETURN_ON_ERROR(error)
-    }
-
-    return error;
-}
-
-} // namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_ATTRIBUTES_TEST_MAX_SIZE_HPP
diff --git a/test_conformance/clcpp/attributes/test_required_num_sub_groups.hpp b/test_conformance/clcpp/attributes/test_required_num_sub_groups.hpp
deleted file mode 100644
index 2380eafe39..0000000000
--- a/test_conformance/clcpp/attributes/test_required_num_sub_groups.hpp
+++ /dev/null
@@ -1,285 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_ATTRIBUTES_TEST_REQUIRED_NUM_SUB_GROUPS_HPP
-#define TEST_CONFORMANCE_CLCPP_ATTRIBUTES_TEST_REQUIRED_NUM_SUB_GROUPS_HPP
-
-#include <sstream>
-#include <string>
-#include <vector>
-#include <random>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-
-
-namespace test_required_num_sub_groups {
-
-struct test_options
-{
-    size_t num_sub_groups;
-    bool spec_const;
-    size_t max_count;
-    size_t num_tests;
-};
-
-struct output_type
-{
-    cl_ulong num_sub_groups;
-    cl_ulong enqueued_num_sub_groups;
-};
-
-const std::string source_common = R"(
-struct output_type
-{
-    ulong num_sub_groups;
-    ulong enqueued_num_sub_groups;
-};
-)";
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-std::string generate_source(test_options options)
-{
-    std::stringstream s;
-    s << source_common;
-    s << R"(
-    #pragma OPENCL EXTENSION cl_khr_subgroups : enable
-
-    kernel void test(global struct output_type *output)
-    {
-        const ulong gid = get_global_linear_id();
-        output[gid].num_sub_groups = get_num_sub_groups();
-        output[gid].enqueued_num_sub_groups = get_enqueued_num_sub_groups();
-    }
-    )";
-
-    return s.str();
-}
-#else
-std::string generate_source(test_options options)
-{
-    std::stringstream s;
-    s << R"(
-    #include <opencl_memory>
-    #include <opencl_work_item>
-    using namespace cl;
-    )";
-
-    if (options.spec_const)
-    {
-        s << "#include <opencl_spec_constant>" << std::endl;
-        s << "cl::spec_constant<uint, 1> num_sub_groups_spec{ 1234567890 };" << std::endl;
-    }
-
-    s << source_common << std::endl;
-    s << "[[cl::required_num_sub_groups(" << (options.spec_const ? "num_sub_groups_spec" : std::to_string(options.num_sub_groups)) << ")]]";
-    s << R"(
-    kernel void test(global_ptr<output_type[]> output)
-    {
-        const ulong gid = get_global_linear_id();
-        output[gid].num_sub_groups = get_num_sub_groups();
-        output[gid].enqueued_num_sub_groups = get_enqueued_num_sub_groups();
-    }
-    )";
-
-    return s.str();
-}
-#endif
-
-int test(cl_device_id device, cl_context context, cl_command_queue queue, test_options options)
-{
-    int error = CL_SUCCESS;
-
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    if (!is_extension_available(device, "cl_khr_subgroups"))
-    {
-        log_info("SKIPPED: Extension `cl_khr_subgroups` is not supported. Skipping tests.\n");
-        return CL_SUCCESS;
-    }
-#endif
-
-    cl_program program;
-    cl_kernel kernel;
-
-    std::string kernel_name = "test";
-    std::string source = generate_source(options);
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name
-    );
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name, "-cl-std=CL2.0", false
-    );
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    const char *source_c_str = source.c_str();
-    error = create_openclcpp_program(context, &program, 1, &source_c_str, "");
-    RETURN_ON_ERROR(error)
-
-    if (options.spec_const)
-    {
-        cl_uint spec_num_sub_groups = static_cast<cl_uint>(options.num_sub_groups);
-        error = clSetProgramSpecializationConstant(program, 1, sizeof(cl_uint), &spec_num_sub_groups);
-        RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-    }
-
-    error = build_program_create_kernel_helper(
-        context, &program, &kernel, 1, &source_c_str, kernel_name.c_str()
-    );
-    RETURN_ON_ERROR(error)
-#endif
-
-    size_t compile_num_sub_groups;
-    error = clGetKernelSubGroupInfo(kernel, device, CL_KERNEL_COMPILE_NUM_SUB_GROUPS,
-        0, NULL,
-        sizeof(size_t), &compile_num_sub_groups, NULL);
-    RETURN_ON_CL_ERROR(error, "clGetKernelSubGroupInfo")
-    if (compile_num_sub_groups != options.num_sub_groups)
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "CL_KERNEL_COMPILE_NUM_SUB_GROUPS did not return correct value (expected %lu, got %lu)",
-            options.num_sub_groups, compile_num_sub_groups
-        )
-    }
-
-    cl_mem output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(output_type) * options.max_count, NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    error = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution<size_t> count_dis(1, options.max_count);
-
-    for (size_t test = 0; test < options.num_tests; test++)
-    {
-        for (size_t dim = 1; dim <= 3; dim++)
-        {
-            size_t global_size[3] = { 1, 1, 1 };
-            size_t count = count_dis(gen);
-            std::uniform_int_distribution<size_t> global_size_dis(1, static_cast<size_t>(pow(count, 1.0 / dim)));
-            for (size_t d = 0; d < dim; d++)
-            {
-                global_size[d] = global_size_dis(gen);
-            }
-            count = global_size[0] * global_size[1] * global_size[2];
-
-            size_t local_size[3] = { 1, 1, 1 };
-            error = clGetKernelSubGroupInfo(kernel, device, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT,
-                sizeof(size_t), &options.num_sub_groups,
-                sizeof(size_t) * dim, local_size, NULL);
-            RETURN_ON_CL_ERROR(error, "clGetKernelSubGroupInfo")
-            if (local_size[0] == 0 || local_size[1] != 1 || local_size[2] != 1)
-            {
-                RETURN_ON_ERROR_MSG(-1,
-                    "CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT did not return correct value"
-                )
-            }
-
-            size_t sub_group_count_for_ndrange;
-            error = clGetKernelSubGroupInfo(kernel, device, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE,
-                sizeof(size_t) * dim, local_size,
-                sizeof(size_t), &sub_group_count_for_ndrange, NULL);
-            RETURN_ON_CL_ERROR(error, "clGetKernelSubGroupInfo")
-            if (sub_group_count_for_ndrange != options.num_sub_groups)
-            {
-                RETURN_ON_ERROR_MSG(-1,
-                    "CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE did not return correct value (expected %lu, got %lu)",
-                    options.num_sub_groups, sub_group_count_for_ndrange
-                )
-            }
-
-            const char pattern = 0;
-            error = clEnqueueFillBuffer(queue, output_buffer, &pattern, sizeof(pattern), 0, sizeof(output_type) * count, 0, NULL, NULL);
-            RETURN_ON_CL_ERROR(error, "clEnqueueFillBuffer")
-
-            error = clEnqueueNDRangeKernel(queue, kernel, dim, NULL, global_size, local_size, 0, NULL, NULL);
-            RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-            std::vector<output_type> output(count);
-            error = clEnqueueReadBuffer(
-                queue, output_buffer, CL_TRUE,
-                0, sizeof(output_type) * count,
-                static_cast<void *>(output.data()),
-                0, NULL, NULL
-            );
-            RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-            for (size_t gid = 0; gid < count; gid++)
-            {
-                const output_type &o = output[gid];
-
-                if (o.enqueued_num_sub_groups != options.num_sub_groups)
-                {
-                    RETURN_ON_ERROR_MSG(-1, "get_enqueued_num_sub_groups does not equal to required_num_sub_groups")
-                }
-                if (o.num_sub_groups > options.num_sub_groups)
-                {
-                    RETURN_ON_ERROR_MSG(-1, "get_num_sub_groups did not return correct value")
-                }
-            }
-        }
-    }
-
-    clReleaseMemObject(output_buffer);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-AUTO_TEST_CASE(test_required_num_sub_groups)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    int error = CL_SUCCESS;
-
-    cl_uint max_num_sub_groups;
-    error = clGetDeviceInfo(device, CL_DEVICE_MAX_NUM_SUB_GROUPS, sizeof(max_num_sub_groups), &max_num_sub_groups, NULL);
-    RETURN_ON_CL_ERROR(error, "clGetDeviceInfo")
-
-    for (bool spec_const : { false, true })
-    for (size_t num_sub_groups = 1; num_sub_groups <= max_num_sub_groups; num_sub_groups++)
-    {
-        test_options options;
-        options.spec_const = spec_const;
-        options.num_sub_groups = num_sub_groups;
-        options.num_tests = 100;
-        options.max_count = num_elements;
-
-        error = test(device, context, queue, options);
-        RETURN_ON_ERROR(error)
-    }
-
-    return error;
-}
-
-} // namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_ATTRIBUTES_TEST_REQUIRED_NUM_SUB_GROUPS_HPP
diff --git a/test_conformance/clcpp/common.hpp b/test_conformance/clcpp/common.hpp
deleted file mode 100644
index e06200265a..0000000000
--- a/test_conformance/clcpp/common.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_COMMON_INC_HPP
-#define TEST_CONFORMANCE_CLCPP_COMMON_INC_HPP
-
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <cmath>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-// harness framework
-#include "harness/compat.h"
-#include "harness/testHarness.h"
-#include "harness/errorHelpers.h"
-#include "harness/kernelHelpers.h"
-
-// autotest
-#include "autotest/autotest.hpp"
-
-// utils_common
-#include "utils_common/is_vector_type.hpp"
-#include "utils_common/scalar_type.hpp"
-#include "utils_common/make_vector_type.hpp"
-#include "utils_common/type_name.hpp"
-#include "utils_common/type_supported.hpp"
-#include "utils_common/vector_size.hpp"
-#include "utils_common/kernel_helpers.hpp"
-#include "utils_common/errors.hpp"
-#include "utils_common/string.hpp"
-
-size_t get_uniform_global_size(size_t global_size, size_t local_size)
-{
-    return static_cast<size_t>(std::ceil(static_cast<double>(global_size) / local_size)) * local_size;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_COMMON_INC_HPP
diff --git a/test_conformance/clcpp/common_funcs/CMakeLists.txt b/test_conformance/clcpp/common_funcs/CMakeLists.txt
deleted file mode 100644
index 5e4d8b035e..0000000000
--- a/test_conformance/clcpp/common_funcs/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_COMMON_FUNCS)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/common_funcs/common_funcs.hpp b/test_conformance/clcpp/common_funcs/common_funcs.hpp
deleted file mode 100644
index d6f8c89704..0000000000
--- a/test_conformance/clcpp/common_funcs/common_funcs.hpp
+++ /dev/null
@@ -1,417 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_COMMON_FUNCS_COMMON_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_COMMON_FUNCS_COMMON_FUNCS_HPP
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#include <type_traits>
-#include <algorithm>
-
-// floatn clamp(floatn x, floatn min, floatn max) (only scalars)
-template<class IN1, class IN2, class IN3, class OUT1>
-struct common_func_clamp : public ternary_func<IN1, IN2, IN3, OUT1>
-{
-    std::string str()
-    {
-        return "clamp";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_common>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& minval, const IN3& maxval)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value
-                && std::is_same<IN2, IN3>::value
-                && std::is_same<IN3, OUT1>::value,
-            "All types must be the same"
-        );
-        return (std::min)((std::max)(x, minval), maxval);
-    }
-
-    IN2 min2()
-    {
-        return (std::numeric_limits<IN2>::min)();
-    }
-
-    IN2 max2()
-    {
-        return (std::numeric_limits<IN2>::max)() / IN2(4000.0f);
-    }
-
-    IN3 min3()
-    {
-        return IN3(1) + ((std::numeric_limits<IN3>::max)() / IN3(4000.0f));
-    }
-
-    IN3 max3()
-    {
-        return (std::numeric_limits<IN3>::max)() / IN3(2000.0f);
-    }
-
-    float ulp()
-    {
-        return 0.0f;
-    }
-};
-
-// floatn degrees(floatn t)
-template<class IN1, class OUT1, class REFERENCE>
-struct common_func_degrees : public unary_func<IN1, OUT1>
-{
-    std::string str()
-    {
-        return "degrees";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_common>\n";
-    }
-
-    REFERENCE operator()(const IN1& x)
-    {
-        static_assert(
-            std::is_same<IN1, OUT1>::value,
-            "All types must be the same"
-        );
-        return (REFERENCE(180.0) / CL_M_PI) * static_cast<REFERENCE>(x);
-    }
-
-    float ulp()
-    {
-        return 2.5f;
-    }
-};
-
-// floatn max(floatn x, floatn y)
-template<class IN1, class IN2, class OUT1>
-struct common_func_max : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "max";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_common>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value && std::is_same<IN2, OUT1>::value,
-            "All types must be the same"
-        );
-        return (std::max)(x, y);
-    }
-
-    float ulp()
-    {
-        return 0.0f;
-    }
-};
-
-// floatn min(floatn x, floatn y)
-template<class IN1, class IN2, class OUT1>
-struct common_func_min : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "min";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_common>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value && std::is_same<IN2, OUT1>::value,
-            "All types must be the same"
-        );
-        return (std::min)(x, y);
-    }
-
-    float ulp()
-    {
-        return 0.0f;
-    }
-};
-
-// floatn mix(floatn x, floatn y, floatn a);
-template<class IN1, class IN2, class IN3, class OUT1>
-struct common_func_mix : public ternary_func<IN1, IN2, IN3, OUT1>
-{
-    std::string str()
-    {
-        return "mix";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_common>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y, const IN3& a)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value
-                && std::is_same<IN2, IN3>::value
-                && std::is_same<IN3, OUT1>::value,
-            "All types must be the same"
-        );
-        return static_cast<double>(x) + ((static_cast<double>(y) - static_cast<double>(x)) * static_cast<double>(a));
-    }
-
-    IN3 min3()
-    {
-        return IN3(0.0f + CL_FLT_EPSILON);
-    }
-
-    IN3 max3()
-    {
-        return IN3(1.0f - CL_FLT_EPSILON);
-    }
-
-    bool use_ulp()
-    {
-        return false;
-    }
-};
-
-// floatn radians(floatn t)
-template<class IN1, class OUT1, class REFERENCE>
-struct common_func_radians : public unary_func<IN1, OUT1>
-{
-    std::string str()
-    {
-        return "radians";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_common>\n";
-    }
-
-    REFERENCE operator()(const IN1& x)
-    {
-        static_assert(
-            std::is_same<IN1, OUT1>::value,
-            "All types must be the same"
-        );
-        return (CL_M_PI / REFERENCE(180.0)) * static_cast<REFERENCE>(x);
-    }
-
-    float ulp()
-    {
-        return 2.5f;
-    }
-};
-
-// floatn step(floatn edge, floatn x)
-template<class IN1, class IN2, class OUT1>
-struct common_func_step : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "step";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_common>\n";
-    }
-
-    OUT1 operator()(const IN1& edge, const IN2& x)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value && std::is_same<IN2, OUT1>::value,
-            "All types must be the same"
-        );
-        if(x < edge)
-            return OUT1(0.0f);
-        return OUT1(1.0f);
-    }
-
-    float ulp()
-    {
-        return 0.0f;
-    }
-};
-
-// floatn smoothstep(floatn edge0, floatn edge1, floatn x);
-template<class IN1, class IN2, class IN3, class OUT1>
-struct common_func_smoothstep : public ternary_func<IN1, IN2, IN3, OUT1>
-{
-    std::string str()
-    {
-        return "smoothstep";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_common>\n";
-    }
-
-    OUT1 operator()(const IN1& edge0, const IN2& edge1, const IN3& x)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value
-                && std::is_same<IN2, IN3>::value
-                && std::is_same<IN3, OUT1>::value,
-            "All types must be the same"
-        );
-        if(x <= edge0)
-        {
-            return OUT1(0.0f);
-        }
-        if(x >= edge1)
-        {
-            return OUT1(1.0f);
-        }
-        OUT1 t = (x - edge0) / (edge1 - edge0);
-        t = t * t * (3.0f - 2.0f * t);
-        return t;
-    }
-
-    // edge0 must be < edge1
-    IN1 min1()
-    {
-        return (std::numeric_limits<IN1>::min)();
-    }
-
-    IN1 max1()
-    {
-        return (std::numeric_limits<IN1>::max)() / IN1(8000.0f);
-    }
-
-    IN2 min2()
-    {
-        return IN3(1) + ((std::numeric_limits<IN2>::max)() / IN2(4000.0f));
-    }
-
-    IN2 max2()
-    {
-        return (std::numeric_limits<IN2>::max)() / IN2(2000.0f);
-    }
-
-    bool use_ulp()
-    {
-        return false;
-    }
-};
-
-// floatn sign(floatn t)
-template<class IN1, class OUT1>
-struct common_func_sign : public unary_func<IN1, OUT1>
-{
-    std::string str()
-    {
-        return "sign";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_common>\n";
-    }
-
-    OUT1 operator()(const IN1& x)
-    {
-        static_assert(
-            std::is_same<IN1, OUT1>::value,
-            "All types must be the same"
-        );
-        if(x == IN1(-0.0f))
-        {
-            return IN1(-0.0f);
-        }
-        if(x == IN1(+0.0f))
-        {
-            return IN1(+0.0f);
-        }
-        if(x > IN1(0.0f))
-        {
-            return IN1(1.0f);
-        }
-        return IN1(-1.0f);
-    }
-
-    bool use_ulp()
-    {
-        return false;
-    }
-
-    float ulp()
-    {
-        return 0.0f;
-    }
-
-    std::vector<IN1> in_special_cases()
-    {
-        return { -0.0f, +0.0f };
-    }
-};
-
-AUTO_TEST_CASE(test_common_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // floatn clamp(floatn x, floatn min, floatn max)
-    TEST_TERNARY_FUNC_MACRO((common_func_clamp<cl_float, cl_float, cl_float, cl_float>()))  
-
-    // floatn degrees(floatn t)
-    TEST_UNARY_FUNC_MACRO((common_func_degrees<cl_float, cl_float, cl_double>()))  
-        
-    // floatn max(floatn x, floatn y);
-    TEST_BINARY_FUNC_MACRO((common_func_max<cl_float, cl_float, cl_float>()))
-
-    // floatn min(floatn x, floatn y);
-    TEST_BINARY_FUNC_MACRO((common_func_min<cl_float, cl_float, cl_float>()))
-   
-    // floatn mix(floatn x, floatn y, floatn a);
-    TEST_TERNARY_FUNC_MACRO((common_func_mix<cl_float, cl_float, cl_float, cl_float>()))
-
-    // floatn radians(floatn t)
-    TEST_UNARY_FUNC_MACRO((common_func_radians<cl_float, cl_float, cl_double>()))
-
-    // floatn step(floatn edge, floatn x)
-    TEST_BINARY_FUNC_MACRO((common_func_step<cl_float, cl_float, cl_float>()))
-
-    // floatn smoothstep(floatn edge0, floatn edge1, floatn x)
-    TEST_TERNARY_FUNC_MACRO((common_func_smoothstep<cl_float, cl_float, cl_float, cl_float>()))
-
-    // floatn sign(floatn t);
-    TEST_UNARY_FUNC_MACRO((common_func_sign<cl_float, cl_float>()))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_COMMON_FUNCS_COMMON_FUNCS_HPP
diff --git a/test_conformance/clcpp/common_funcs/main.cpp b/test_conformance/clcpp/common_funcs/main.cpp
deleted file mode 100644
index 4a6277a393..0000000000
--- a/test_conformance/clcpp/common_funcs/main.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include <limits>
-
-#include "../common.hpp"
-
-#include "common_funcs.hpp"
-
-int main(int argc, const char *argv[])
-{
-    // Check if cl_float (float) and cl_double (double) fulfill the requirements of
-    // IEC 559 (IEEE 754) standard. This is required for the tests to run correctly.
-    if(!std::numeric_limits<cl_float>::is_iec559)
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "cl_float (float) does not fulfill the requirements of IEC 559 (IEEE 754) standard. "
-            "Tests won't run correctly."
-        );
-    }
-    if(!std::numeric_limits<cl_double>::is_iec559)
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "cl_double (double) does not fulfill the requirements of IEC 559 (IEEE 754) standard. "
-            "Tests won't run correctly."
-        );
-    }
-
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/convert/CMakeLists.txt b/test_conformance/clcpp/convert/CMakeLists.txt
deleted file mode 100644
index 9f69feabb4..0000000000
--- a/test_conformance/clcpp/convert/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_CONVERT)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/convert/convert_cast.hpp b/test_conformance/clcpp/convert/convert_cast.hpp
deleted file mode 100644
index 81fcca63cc..0000000000
--- a/test_conformance/clcpp/convert/convert_cast.hpp
+++ /dev/null
@@ -1,309 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_CONVERT_CONVERT_CAST_HPP
-#define TEST_CONFORMANCE_CLCPP_CONVERT_CONVERT_CAST_HPP
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#include <functional>
-
-
-enum class rounding_mode
-{
-    def,
-    /*rte, not implemented here */
-    rtz,
-    rtp,
-    rtn
-};
-
-enum class saturate { def, off, on };
-
-std::string rounding_mode_name(rounding_mode rmode)
-{
-    switch (rmode)
-    {
-        case rounding_mode::rtz: return "rtz";
-        case rounding_mode::rtp: return "rtp";
-        case rounding_mode::rtn: return "rtn";
-        default: return "";
-    }
-}
-
-std::string saturate_name(saturate smode)
-{
-    switch (smode)
-    {
-        case saturate::off: return "off";
-        case saturate::on:  return "on";
-        default: return "";
-    }
-}
-
-template<class T>
-T clamp(T x, T a, T b)
-{
-    return (std::min)(b, (std::max)(a, x));
-}
-
-template<class IN1, class OUT1>
-struct convert_cast : public unary_func<IN1, OUT1>
-{
-    static_assert(vector_size<IN1>::value == vector_size<OUT1>::value, "The operand and result type must have the same number of elements");
-
-    typedef typename scalar_type<IN1>::type in_scalar_type;
-    typedef typename scalar_type<OUT1>::type out_scalar_type;
-
-    in_scalar_type in_min;
-    in_scalar_type in_max;
-    rounding_mode rmode;
-    saturate smode;
-
-    convert_cast(in_scalar_type min, in_scalar_type max, rounding_mode rmode, saturate smode)
-        : in_min(min), in_max(max), rmode(rmode), smode(smode)
-    {
-    }
-
-    std::string str()
-    {
-        return "convert_cast";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_convert>\n";
-    }
-
-    IN1 min1()
-    {
-        return detail::def_limit<IN1>(in_min);
-    }
-
-    IN1 max1()
-    {
-        return detail::def_limit<IN1>(in_max);
-    }
-
-    OUT1 operator()(const IN1& x)
-    {
-        OUT1 y;
-        for (size_t i = 0; i < vector_size<IN1>::value; i++)
-        {
-            in_scalar_type v;
-            if (smode == saturate::on)
-                v = clamp(x.s[i],
-                    static_cast<in_scalar_type>((std::numeric_limits<out_scalar_type>::min)()),
-                    static_cast<in_scalar_type>((std::numeric_limits<out_scalar_type>::max)())
-                );
-            else
-                v = x.s[i];
-
-            if (std::is_integral<out_scalar_type>::value)
-            {
-                switch (rmode)
-                {
-                    case rounding_mode::rtp:
-                        y.s[i] = static_cast<out_scalar_type>(std::ceil(v));
-                        break;
-                    case rounding_mode::rtn:
-                        y.s[i] = static_cast<out_scalar_type>(std::floor(v));
-                        break;
-                    default:
-                        y.s[i] = static_cast<out_scalar_type>(v);
-                }
-            }
-            else
-            {
-                y.s[i] = static_cast<out_scalar_type>(v);
-            }
-        }
-        return y;
-    }
-};
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <class func_type, class in_type, class out_type>
-std::string generate_kernel_convert_cast(func_type func)
-{
-    std::string in1_value = "input[gid]";
-    std::string function_call = "convert_" + type_name<out_type>();
-    if (func.smode == saturate::on)
-        function_call += "_sat";
-    if (func.rmode != rounding_mode::def)
-        function_call += "_" + rounding_mode_name(func.rmode);
-    function_call += "(" + in1_value + ")";
-    return
-        "__kernel void test_" + func.str() + "(global " + type_name<in_type>() + " *input, global " + type_name<out_type>() + " *output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    output[gid] = " + function_call + ";\n"
-        "}\n";
-}
-#else
-template <class func_type, class in_type, class out_type>
-std::string generate_kernel_convert_cast(func_type func)
-{
-    std::string headers = func.headers();
-    std::string in1_value = "input[gid]";
-    std::string function_call = "convert_cast<" + type_name<out_type>();
-    if (func.rmode != rounding_mode::def)
-        function_call += ", rounding_mode::" + rounding_mode_name(func.rmode);
-    if (func.smode != saturate::def)
-        function_call += ", saturate::" + saturate_name(func.smode);
-    function_call += ">(" + in1_value + ")";
-    return
-        "" + func.defs() +
-        "" + headers +
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "using namespace cl;\n"
-        "__kernel void test_" + func.str() + "(global_ptr<" + type_name<in_type>() +  "[]> input,"
-                                              "global_ptr<" + type_name<out_type>() + "[]> output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    output[gid] = " + function_call + ";\n"
-        "}\n";
-}
-#endif
-
-template <class convert_cast_op>
-int test_convert_cast_func(cl_device_id device, cl_context context, cl_command_queue queue, size_t count, convert_cast_op op)
-{
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t work_size[1];
-    int error;
-
-    typedef typename convert_cast_op::in_type INPUT;
-    typedef typename convert_cast_op::out_type OUTPUT;
-
-    // Don't run test for unsupported types
-    if (!(type_supported<INPUT>(device) && type_supported<OUTPUT>(device)))
-    {
-        return CL_SUCCESS;
-    }
-
-    std::string code_str = generate_kernel_convert_cast<convert_cast_op, INPUT, OUTPUT>(op);
-    std::string kernel_name("test_"); kernel_name += op.str();
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name, "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(error)
-#else
-    error = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(error)
-#endif
-
-    std::vector<INPUT> input = generate_input<INPUT>(count, op.min1(), op.max1(), op.in_special_cases());
-    std::vector<OUTPUT> output = generate_output<OUTPUT>(count);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(INPUT) * input.size(), NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(OUTPUT) * output.size(), NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    error = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(INPUT) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueWriteBuffer")
-
-    error = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-    error = clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    work_size[0] = count;
-    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-    error = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(OUTPUT) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-    if (!verify_unary(input, output, op))
-    {
-        RETURN_ON_ERROR_MSG(-1, "test_%s %s(%s) failed", op.str().c_str(), type_name<OUTPUT>().c_str(), type_name<INPUT>().c_str());
-    }
-    log_info("test_%s %s(%s) passed\n", op.str().c_str(), type_name<OUTPUT>().c_str(), type_name<INPUT>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-
-AUTO_TEST_CASE(test_convert_cast)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-#define TEST_CONVERT_CAST_MACRO(OP) \
-    last_error = test_convert_cast_func( \
-        device, context, queue, n_elems, OP \
-    ); \
-    CHECK_ERROR(last_error) \
-    error |= last_error;
-
-    // No-op
-    TEST_CONVERT_CAST_MACRO((convert_cast<cl_float2, cl_float2>(-100.0f, +100.0f, rounding_mode::rtn, saturate::def)))
-    TEST_CONVERT_CAST_MACRO((convert_cast<cl_uchar2, cl_uchar2>(0, 255, rounding_mode::def, saturate::def)))
-
-    // int to int
-    TEST_CONVERT_CAST_MACRO((convert_cast<cl_int4, cl_short4>(40000, 40000, rounding_mode::def, saturate::on)))
-    TEST_CONVERT_CAST_MACRO((convert_cast<cl_uchar8, cl_char8>(0, 127, rounding_mode::def, saturate::off)))
-    TEST_CONVERT_CAST_MACRO((convert_cast<cl_char8, cl_int8>(-100, 100, rounding_mode::def, saturate::off)))
-
-    // float to int
-    TEST_CONVERT_CAST_MACRO((convert_cast<cl_float2, cl_uchar2>(-100.0f, +400.0f, rounding_mode::def, saturate::on)))
-    TEST_CONVERT_CAST_MACRO((convert_cast<cl_double4, cl_char4>(-127.0, +127.0, rounding_mode::rtp, saturate::off)))
-    TEST_CONVERT_CAST_MACRO((convert_cast<cl_float8, cl_uint8>(-1000.0f, +10000.0f, rounding_mode::rtp, saturate::on)))
-    TEST_CONVERT_CAST_MACRO((convert_cast<cl_float16, cl_ushort16>(-10000.0f, +70000.0f, rounding_mode::rtn, saturate::on)))
-
-    // int to float
-    TEST_CONVERT_CAST_MACRO((convert_cast<cl_short8, cl_float8>(0, 12345, rounding_mode::def, saturate::def)))
-    TEST_CONVERT_CAST_MACRO((convert_cast<cl_long2, cl_float2>(-1000000, +1000000, rounding_mode::rtz, saturate::def)))
-
-#undef TEST_CONVERT_CAST_MACRO
-
-    if (error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_CONVERT_CONVERT_CAST_HPP
diff --git a/test_conformance/clcpp/convert/main.cpp b/test_conformance/clcpp/convert/main.cpp
deleted file mode 100644
index 78e3763750..0000000000
--- a/test_conformance/clcpp/convert/main.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "convert_cast.hpp"
-
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/device_queue/CMakeLists.txt b/test_conformance/clcpp/device_queue/CMakeLists.txt
deleted file mode 100644
index 0e1b2ee436..0000000000
--- a/test_conformance/clcpp/device_queue/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_DEVICE_QUEUE)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/device_queue/main.cpp b/test_conformance/clcpp/device_queue/main.cpp
deleted file mode 100644
index 0467b19f00..0000000000
--- a/test_conformance/clcpp/device_queue/main.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "test_enqueue.hpp"
-
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/device_queue/test_enqueue.hpp b/test_conformance/clcpp/device_queue/test_enqueue.hpp
deleted file mode 100644
index f5d4e6dc28..0000000000
--- a/test_conformance/clcpp/device_queue/test_enqueue.hpp
+++ /dev/null
@@ -1,699 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_DEVICE_QUEUE_TEST_ENQUEUE_HPP
-#define TEST_CONFORMANCE_CLCPP_DEVICE_QUEUE_TEST_ENQUEUE_HPP
-
-#include <sstream>
-#include <string>
-#include <vector>
-#include <algorithm>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-
-
-namespace test_enqueue {
-
-struct test_options
-{
-    int test;
-};
-
-struct output_type
-{
-    cl_int enqueue_kernel1_success;
-    cl_int enqueue_kernel2_success;
-    cl_int enqueue_kernel3_success;
-    cl_int enqueue_marker_success;
-    cl_int event1_is_valid;
-    cl_int event2_is_valid;
-    cl_int event3_is_valid;
-    cl_int user_event1_is_valid;
-    cl_int user_event2_is_valid;
-    cl_int values[10000];
-};
-
-const std::string source_common = R"(
-struct output_type
-{
-    int enqueue_kernel1_success;
-    int enqueue_kernel2_success;
-    int enqueue_kernel3_success;
-    int enqueue_marker_success;
-    int event1_is_valid;
-    int event2_is_valid;
-    int event3_is_valid;
-    int user_event1_is_valid;
-    int user_event2_is_valid;
-    int values[10000];
-};
-)";
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-std::string generate_source(test_options options)
-{
-    std::stringstream s;
-    s << source_common;
-    if (options.test == 0)
-    {
-        s << R"(
-    kernel void test(queue_t queue, global struct output_type *output)
-    {
-        const ulong gid = get_global_id(0);
-
-        if (gid != 0)
-            return;
-
-        output->enqueue_kernel2_success = 1;
-        output->enqueue_kernel3_success = 1;
-        output->enqueue_marker_success = 1;
-        output->event2_is_valid = 1;
-        output->event3_is_valid = 1;
-        output->user_event1_is_valid = 1;
-        output->user_event2_is_valid = 1;
-
-        queue_t default_queue = get_default_queue();
-
-        ndrange_t ndrange1 = ndrange_1D(get_global_size(0));
-        clk_event_t event1;
-        int status1 = enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange1, 0, NULL, &event1,
-        ^{
-            const ulong gid = get_global_id(0);
-            output->values[gid] = 1;
-        });
-        output->enqueue_kernel1_success = status1 == CLK_SUCCESS;
-        output->event1_is_valid = is_valid_event(event1);
-
-        release_event(event1);
-    }
-    )";
-    }
-    else if (options.test == 1)
-    {
-        s << R"(
-    kernel void test(queue_t queue, global struct output_type *output)
-    {
-        const ulong gid = get_global_id(0);
-
-        if (gid != 0)
-            return;
-
-        output->enqueue_kernel3_success = 1;
-        output->enqueue_marker_success = 1;
-        output->event3_is_valid = 1;
-        output->user_event1_is_valid = 1;
-        output->user_event2_is_valid = 1;
-
-        queue_t default_queue = get_default_queue();
-
-        ndrange_t ndrange1 = ndrange_1D(get_global_size(0) / 2);
-        clk_event_t event1;
-        int status1 = enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP, ndrange1, 0, NULL, &event1,
-        ^{
-            const ulong gid = get_global_id(0);
-            output->values[gid * 2] = 1;
-        });
-        output->enqueue_kernel1_success = status1 == CLK_SUCCESS;
-        output->event1_is_valid = is_valid_event(event1);
-
-        ndrange_t ndrange2 = ndrange_1D(1, get_global_size(0) / 2, 1);
-        clk_event_t event2;
-        int status2 = enqueue_kernel(queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange2, 1, &event1, &event2,
-        ^{
-            const ulong gid = get_global_id(0);
-            output->values[(gid - 1) * 2 + 1] = 1;
-        });
-        output->enqueue_kernel2_success = status2 == CLK_SUCCESS;
-        output->event2_is_valid = is_valid_event(event2);
-
-        release_event(event1);
-        release_event(event2);
-    }
-    )";
-    }
-    else if (options.test == 2)
-    {
-        s << R"(
-    kernel void test(queue_t queue, global struct output_type *output)
-    {
-        const ulong gid = get_global_id(0);
-
-        if (gid != 0)
-            return;
-
-        output->enqueue_marker_success = 1;
-        output->event3_is_valid = 1;
-        output->enqueue_kernel3_success = 1;
-
-        queue_t default_queue = get_default_queue();
-
-        clk_event_t user_event1 = create_user_event();
-        retain_event(user_event1);
-        output->user_event1_is_valid = is_valid_event(user_event1);
-
-        ndrange_t ndrange1 = ndrange_1D(get_global_size(0) / 2);
-        clk_event_t event1;
-        int status1 = enqueue_kernel(queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange1, 1, &user_event1, &event1,
-        ^{
-            const ulong gid = get_global_id(0);
-            output->values[gid * 2] = 1;
-        });
-        output->enqueue_kernel1_success = status1 == CLK_SUCCESS;
-        output->event1_is_valid = is_valid_event(event1);
-        release_event(user_event1);
-
-        clk_event_t user_event2 = create_user_event();
-        output->user_event2_is_valid = is_valid_event(user_event2);
-
-        clk_event_t events[2];
-        events[0] = user_event2;
-        events[1] = user_event1;
-
-        ndrange_t ndrange2 = ndrange_1D(1, get_global_size(0) / 2, get_local_size(0));
-        clk_event_t event2;
-        int status2 = enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange2, 2, events, &event2,
-        ^(local void *p0, local void *p1, local void *p2) {
-            const ulong gid = get_global_id(0);
-            const ulong lid = get_local_id(0);
-            local int2 *l0 = (local int2 *)p0;
-            local int *l1 = (local int *)p1;
-            local int *l2 = (local int *)p2;
-            l1[get_local_size(0) - lid - 1] = gid > 0 ? 1 : 0;
-            work_group_barrier(CLK_LOCAL_MEM_FENCE);
-            if (lid < 5) l0[lid] = (int2)(3, 4);
-            if (lid < 3) l2[lid] = 5;
-            work_group_barrier(CLK_LOCAL_MEM_FENCE);
-            output->values[(gid - 1) * 2 + 1] = min(l1[lid], min(l0[0].x, l2[0]));
-        }, sizeof(int2) * 5, sizeof(int) * get_local_size(0), sizeof(int) * 3);
-        output->enqueue_kernel2_success = status2 == CLK_SUCCESS;
-        output->event2_is_valid = is_valid_event(event2);
-
-        set_user_event_status(user_event1, CL_COMPLETE);
-        set_user_event_status(user_event2, CL_COMPLETE);
-
-        release_event(user_event1);
-        release_event(user_event2);
-        release_event(event1);
-        release_event(event2);
-    }
-    )";
-    }
-    else if (options.test == 3)
-    {
-        s << R"(
-    kernel void test(queue_t queue, global struct output_type *output)
-    {
-        const ulong gid = get_global_id(0);
-
-        if (gid != 0)
-            return;
-
-        output->user_event2_is_valid = 1;
-
-        queue_t default_queue = get_default_queue();
-
-        ndrange_t ndrange1 = ndrange_1D(get_global_size(0) / 2);
-        clk_event_t event1;
-        int status1 = enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP, ndrange1, 0, NULL, &event1,
-        ^{
-            const ulong gid = get_global_id(0);
-            output->values[gid * 2] = 20;
-        });
-        output->enqueue_kernel1_success = status1 == CLK_SUCCESS;
-        output->event1_is_valid = is_valid_event(event1);
-
-        ndrange_t ndrange2 = ndrange_1D(1, get_global_size(0) / 2, 1);
-        clk_event_t event2;
-        int status2 = enqueue_kernel(queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange2, 0, NULL, &event2,
-        ^{
-            const ulong gid = get_global_id(0);
-            output->values[(gid - 1) * 2 + 1] = 20;
-        });
-        output->enqueue_kernel2_success = status2 == CLK_SUCCESS;
-        output->event2_is_valid = is_valid_event(event2);
-
-        clk_event_t user_event1 = create_user_event();
-        output->user_event1_is_valid = is_valid_event(user_event1);
-
-        clk_event_t events[3];
-        events[0] = event2;
-        events[1] = user_event1;
-        events[2] = event1;
-
-        clk_event_t event3;
-        int status3 = enqueue_marker(queue, 3, events, &event3);
-        output->enqueue_marker_success = status3 == CLK_SUCCESS;
-        output->event3_is_valid = is_valid_event(event3);
-
-        int status4 = enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange_1D(get_global_size(0)), 1, &event3, NULL,
-        ^{
-            const ulong gid = get_global_id(0);
-            output->values[gid] /= 20;
-        });
-        output->enqueue_kernel3_success = status4 == CLK_SUCCESS;
-
-        set_user_event_status(user_event1, CL_COMPLETE);
-
-        release_event(user_event1);
-        release_event(event1);
-        release_event(event2);
-        release_event(event3);
-    }
-    )";
-    }
-
-    return s.str();
-}
-#else
-std::string generate_source(test_options options)
-{
-    std::stringstream s;
-    s << R"(
-    #include <opencl_memory>
-    #include <opencl_common>
-    #include <opencl_work_item>
-    #include <opencl_synchronization>
-    #include <opencl_device_queue>
-    using namespace cl;
-    )";
-
-    s << source_common;
-    if (options.test == 0)
-    {
-        s << R"(
-    kernel void test(device_queue queue, global<output_type> *output)
-    {
-        const ulong gid = get_global_id(0);
-
-        if (gid != 0)
-            return;
-
-        output->enqueue_kernel2_success = 1;
-        output->enqueue_kernel3_success = 1;
-        output->enqueue_marker_success = 1;
-        output->event2_is_valid = 1;
-        output->event3_is_valid = 1;
-        output->user_event1_is_valid = 1;
-        output->user_event2_is_valid = 1;
-
-        device_queue default_queue = get_default_device_queue();
-
-        ndrange ndrange1(get_global_size(0));
-        event event1;
-        enqueue_status status1 = default_queue.enqueue_kernel(enqueue_policy::no_wait, 0, nullptr, &event1, ndrange1,
-        [](global<output_type> *output) {
-            const ulong gid = get_global_id(0);
-            output->values[gid] = 1;
-        }, output);
-        output->enqueue_kernel1_success = status1 == enqueue_status::success;
-        output->event1_is_valid = event1.is_valid();
-
-        event1.release();
-    }
-    )";
-    }
-    else if (options.test == 1)
-    {
-        s << R"(
-    kernel void test(device_queue queue, global<output_type> *output)
-    {
-        const ulong gid = get_global_id(0);
-
-        if (gid != 0)
-            return;
-
-        output->enqueue_kernel3_success = 1;
-        output->enqueue_marker_success = 1;
-        output->event3_is_valid = 1;
-        output->user_event1_is_valid = 1;
-        output->user_event2_is_valid = 1;
-
-        device_queue default_queue = get_default_device_queue();
-
-        ndrange ndrange1(get_global_size(0) / 2);
-        event event1;
-        enqueue_status status1 = default_queue.enqueue_kernel(enqueue_policy::wait_work_group, 0, nullptr, &event1, ndrange1,
-        [](global<output_type> *output) {
-            const ulong gid = get_global_id(0);
-            output->values[gid * 2] = 1;
-        }, output);
-        output->enqueue_kernel1_success = status1 == enqueue_status::success;
-        output->event1_is_valid = event1.is_valid();
-
-        ndrange ndrange2(1, get_global_size(0) / 2, 1);
-        event event2;
-        enqueue_status status2 = queue.enqueue_kernel(enqueue_policy::wait_kernel, 1, &event1, &event2, ndrange2,
-        [](global<output_type> *output) {
-            const ulong gid = get_global_id(0);
-            output->values[(gid - 1) * 2 + 1] = 1;
-        }, output);
-        output->enqueue_kernel2_success = status2 == enqueue_status::success;
-        output->event2_is_valid = event2.is_valid();
-
-        event1.release();
-        event2.release();
-    }
-    )";
-    }
-    else if (options.test == 2)
-    {
-        s << R"(
-    kernel void test(device_queue queue, global<output_type> *output)
-    {
-        const ulong gid = get_global_id(0);
-
-        if (gid != 0)
-            return;
-
-        output->enqueue_marker_success = 1;
-        output->event3_is_valid = 1;
-        output->enqueue_kernel3_success = 1;
-
-        device_queue default_queue = get_default_device_queue();
-
-        event user_event1 = make_user_event();
-        user_event1.retain();
-        output->user_event1_is_valid = user_event1.is_valid();
-
-        ndrange ndrange1(get_global_size(0) / 2);
-        event event1;
-        enqueue_status status1 = queue.enqueue_kernel(enqueue_policy::wait_kernel, 1, &user_event1, &event1, ndrange1,
-        [](global<output_type> *output){
-            const ulong gid = get_global_id(0);
-            output->values[gid * 2] = 1;
-        }, output);
-        output->enqueue_kernel1_success = status1 == enqueue_status::success;
-        output->event1_is_valid = event1.is_valid();
-        user_event1.release();
-
-        event user_event2 = make_user_event();
-        output->user_event2_is_valid = user_event2.is_valid();
-
-        event events[2];
-        events[0] = user_event2;
-        events[1] = user_event1;
-
-        ndrange ndrange2(1, get_global_size(0) / 2, get_local_size(0));
-        event event2;
-        enqueue_status status2 = default_queue.enqueue_kernel(enqueue_policy::no_wait, 2, events, &event2, ndrange2,
-        [](global<output_type> *output, local_ptr<int2[]> l0, local_ptr<int[]> l1, local_ptr<int[]> l2) {
-            const ulong gid = get_global_id(0);
-            const ulong lid = get_local_id(0);
-            l1[get_local_size(0) - lid - 1] = gid > 0 ? 1 : 0;
-            work_group_barrier(mem_fence::local);
-            if (lid < 5) l0[lid] = int2(3, 4);
-            if (lid < 3) l2[lid] = 5;
-            work_group_barrier(mem_fence::local);
-            output->values[(gid - 1) * 2 + 1] = min(l1[lid], min(l0[0].x, l2[0]));
-        }, output, local_ptr<int2[]>::size_type(5), local_ptr<int[]>::size_type(get_local_size(0)), local_ptr<int[]>::size_type(3));
-        output->enqueue_kernel2_success = status2 == enqueue_status::success;
-        output->event2_is_valid = event2.is_valid();
-
-        user_event1.set_status(event_status::complete);
-        user_event2.set_status(event_status::complete);
-
-        user_event1.release();
-        user_event2.release();
-        event1.release();
-        event2.release();
-    }
-    )";
-    }
-    else if (options.test == 3)
-    {
-        s << R"(
-    kernel void test(device_queue queue, global<output_type> *output)
-    {
-        const ulong gid = get_global_id(0);
-
-        if (gid != 0)
-            return;
-
-        output->user_event2_is_valid = 1;
-
-        device_queue default_queue = get_default_device_queue();
-
-        ndrange ndrange1(get_global_size(0) / 2);
-        event event1;
-        enqueue_status status1 = default_queue.enqueue_kernel(enqueue_policy::wait_work_group, 0, nullptr, &event1, ndrange1,
-        [](global<output_type> *output) {
-            const ulong gid = get_global_id(0);
-            output->values[gid * 2] = 20;
-        }, output);
-        output->enqueue_kernel1_success = status1 == enqueue_status::success;
-        output->event1_is_valid = event1.is_valid();
-
-        ndrange ndrange2(1, get_global_size(0) / 2, 1);
-        event event2;
-        enqueue_status status2 = queue.enqueue_kernel(enqueue_policy::wait_kernel, 0, nullptr, &event2, ndrange2,
-        [](global<output_type> *output) {
-            const ulong gid = get_global_id(0);
-            output->values[(gid - 1) * 2 + 1] = 20;
-        }, output);
-        output->enqueue_kernel2_success = status2 == enqueue_status::success;
-        output->event2_is_valid = event2.is_valid();
-
-        event user_event1 = make_user_event();
-        output->user_event1_is_valid = user_event1.is_valid();
-
-        event events[3];
-        events[0] = event2;
-        events[1] = user_event1;
-        events[2] = event1;
-
-        event event3;
-        enqueue_status status3 = queue.enqueue_marker(3, events, &event3);
-        output->enqueue_marker_success = status3 == enqueue_status::success;
-        output->event3_is_valid = event3.is_valid();
-
-        enqueue_status status4 = default_queue.enqueue_kernel(enqueue_policy::no_wait, 1, &event3, nullptr, ndrange(get_global_size(0)),
-        [](global<output_type> *output) {
-            const ulong gid = get_global_id(0);
-            output->values[gid] /= 20;
-        }, output);
-        output->enqueue_kernel3_success = status4 == enqueue_status::success;
-
-        user_event1.set_status(event_status::complete);
-
-        user_event1.release();
-        event1.release();
-        event2.release();
-        event3.release();
-    }
-    )";
-    }
-
-    return s.str();
-}
-#endif
-
-int test(cl_device_id device, cl_context context, cl_command_queue queue, test_options options)
-{
-    int error = CL_SUCCESS;
-
-    cl_program program;
-    cl_kernel kernel;
-
-    std::string kernel_name = "test";
-    std::string source = generate_source(options);
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name
-    );
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name, "-cl-std=CL2.0", false
-    );
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name
-    );
-    RETURN_ON_ERROR(error)
-#endif
-
-    cl_uint max_queues;
-    error = clGetDeviceInfo(device, CL_DEVICE_MAX_ON_DEVICE_QUEUES, sizeof(cl_uint), &max_queues, NULL);
-    RETURN_ON_CL_ERROR(error, "clGetDeviceInfo")
-
-    cl_uint max_events;
-    error = clGetDeviceInfo(device, CL_DEVICE_MAX_ON_DEVICE_EVENTS, sizeof(cl_uint), &max_events, NULL);
-    RETURN_ON_CL_ERROR(error, "clGetDeviceInfo")
-
-    cl_command_queue device_queue1 = NULL;
-    cl_command_queue device_queue2 = NULL;
-
-    cl_queue_properties queue_properties1[] =
-    {
-        CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT,
-        0
-    };
-    device_queue1 = clCreateCommandQueueWithProperties(context, device, queue_properties1, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateCommandQueueWithProperties")
-
-    if (max_queues > 1)
-    {
-        cl_queue_properties queue_properties2[] =
-        {
-            CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE,
-            0
-        };
-        device_queue2 = clCreateCommandQueueWithProperties(context, device, queue_properties2, &error);
-        RETURN_ON_CL_ERROR(error, "clCreateCommandQueueWithProperties")
-    }
-
-    cl_mem output_buffer;
-    output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(output_type), NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    error = clSetKernelArg(kernel, 0, sizeof(cl_command_queue), device_queue2 != NULL ? &device_queue2 : &device_queue1);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-    error = clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    const char pattern = 0;
-    error = clEnqueueFillBuffer(queue, output_buffer, &pattern, sizeof(pattern), 0, sizeof(output_type), 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueFillBuffer")
-
-    size_t max_work_group_size;
-    error = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL);
-    RETURN_ON_CL_ERROR(error, "clGetDeviceInfo")
-
-    const size_t local_size = (std::min)((size_t)256, max_work_group_size);
-    const size_t global_size = 10000 / local_size * local_size;
-    const size_t count = global_size;
-    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-    output_type output;
-    error = clEnqueueReadBuffer(
-        queue, output_buffer, CL_TRUE,
-        0, sizeof(output_type),
-        static_cast<void *>(&output),
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-    if (!output.enqueue_kernel1_success)
-    {
-        RETURN_ON_ERROR_MSG(-1, "enqueue_kernel did not succeed")
-    }
-    if (!output.enqueue_kernel2_success)
-    {
-        RETURN_ON_ERROR_MSG(-1, "enqueue_kernel did not succeed")
-    }
-    if (!output.enqueue_kernel3_success)
-    {
-        RETURN_ON_ERROR_MSG(-1, "enqueue_kernel did not succeed")
-    }
-    if (!output.enqueue_marker_success)
-    {
-        RETURN_ON_ERROR_MSG(-1, "enqueue_marker did not succeed")
-    }
-    if (!output.event1_is_valid)
-    {
-        RETURN_ON_ERROR_MSG(-1, "event1 is not valid")
-    }
-    if (!output.event2_is_valid)
-    {
-        RETURN_ON_ERROR_MSG(-1, "event2 is not valid")
-    }
-    if (!output.event3_is_valid)
-    {
-        RETURN_ON_ERROR_MSG(-1, "event3 is not valid")
-    }
-    if (!output.user_event1_is_valid)
-    {
-        RETURN_ON_ERROR_MSG(-1, "user_event1 is not valid")
-    }
-    if (!output.user_event2_is_valid)
-    {
-        RETURN_ON_ERROR_MSG(-1, "user_event2 is not valid")
-    }
-
-    for (size_t i = 0; i < count; i++)
-    {
-        const cl_int result = output.values[i];
-        const cl_int expected = 1;
-
-        if (result != expected)
-        {
-            RETURN_ON_ERROR_MSG(-1,
-                "kernel did not return correct value. Expected: %s, got: %s",
-                format_value(expected).c_str(), format_value(result).c_str()
-            )
-        }
-    }
-
-    clReleaseMemObject(output_buffer);
-    clReleaseCommandQueue(device_queue1);
-    if (device_queue2 != NULL)
-        clReleaseCommandQueue(device_queue2);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-AUTO_TEST_CASE(test_enqueue_one_kernel)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    test_options options;
-    options.test = 0;
-    return test(device, context, queue, options);
-}
-
-AUTO_TEST_CASE(test_enqueue_two_kernels)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    test_options options;
-    options.test = 1;
-    return test(device, context, queue, options);
-}
-
-AUTO_TEST_CASE(test_enqueue_user_events_and_locals)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    test_options options;
-    options.test = 2;
-    return test(device, context, queue, options);
-}
-
-AUTO_TEST_CASE(test_enqueue_marker)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    test_options options;
-    options.test = 3;
-    return test(device, context, queue, options);
-}
-
-} // namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_DEVICE_QUEUE_TEST_ENQUEUE_HPP
diff --git a/test_conformance/clcpp/funcs_test_utils.hpp b/test_conformance/clcpp/funcs_test_utils.hpp
deleted file mode 100644
index e839231ca5..0000000000
--- a/test_conformance/clcpp/funcs_test_utils.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_FUNCS_TEST_UTILS_HPP
-#define TEST_CONFORMANCE_CLCPP_FUNCS_TEST_UTILS_HPP
-
-// This file contains helper classes and functions for testing various unary, binary
-// and ternary OpenCL functions (for example cl::abs(x) or cl::abs_diff(x, y)), 
-// as well as other helper functions/classes.
-
-#include "common.hpp"
-
-#define TEST_UNARY_FUNC_MACRO(TEST_CLASS) \
-    last_error = test_unary_func(  \
-        device, context, queue, n_elems, TEST_CLASS \
-    );  \
-    CHECK_ERROR(last_error) \
-    error |= last_error;
-
-#define TEST_BINARY_FUNC_MACRO(TEST_CLASS) \
-    last_error = test_binary_func(  \
-        device, context, queue, n_elems, TEST_CLASS \
-    );  \
-    CHECK_ERROR(last_error) \
-    error |= last_error;
-
-#define TEST_TERNARY_FUNC_MACRO(TEST_CLASS) \
-    last_error = test_ternary_func(  \
-        device, context, queue, n_elems, TEST_CLASS \
-    );  \
-    CHECK_ERROR(last_error) \
-    error |= last_error;
-
-#include "utils_test/compare.hpp"
-#include "utils_test/generate_inputs.hpp"
-
-// HOWTO:
-//
-// unary_func, binary_func, ternary_func - base classes wrapping OpenCL functions that
-// you want to test.
-// 
-// To create a wrapper class for given function, you need to create a class derived from correct
-// base class (unary_func, binary_func, ternary_func), and define:
-//
-// * std::string str() method which should return class name in OpenCL ("abs", "abs_diff"),
-// * operator(x), operator(x, y) or operator(x,y,z) depending on arity of the function you wish
-// to test, method should work exactly as the tested function works in OpenCL
-// * if it's needed you can overload min1, max1, min2, max2, min3, max3 methods with returns min 
-// and max values that can be generated for given input (function argument) [required for vec 
-// arguments],
-// * if you want to use vector arguments (for example: cl_int2, cl_ulong16), you should look at
-// how int_func_clamp<> is implemented in integer_funcs/numeric_funcs.hpp.
-//
-// To see how you should use class you've just created see AUTO_TEST_CASE(test_int_numeric_funcs)
-// in integer_funcs/numeric_funcs.hpp.
-#include "utils_test/unary.hpp"
-#include "utils_test/binary.hpp"
-#include "utils_test/ternary.hpp"
-
-#endif // TEST_CONFORMANCE_CLCPP_FUNCS_TEST_UTILS_HPP
diff --git a/test_conformance/clcpp/geometric_funcs/CMakeLists.txt b/test_conformance/clcpp/geometric_funcs/CMakeLists.txt
deleted file mode 100644
index 25d05ed14c..0000000000
--- a/test_conformance/clcpp/geometric_funcs/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_GEOMETRIC_FUNCS)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/geometric_funcs/fast_geometric_funcs.hpp b/test_conformance/clcpp/geometric_funcs/fast_geometric_funcs.hpp
deleted file mode 100644
index c179728889..0000000000
--- a/test_conformance/clcpp/geometric_funcs/fast_geometric_funcs.hpp
+++ /dev/null
@@ -1,229 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_GEOMETRIC_FUNCS_FAST_GEOMETRIC_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_GEOMETRIC_FUNCS_FAST_GEOMETRIC_FUNCS_HPP
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#include <type_traits>
-
-// float fast_distance(float4 p0, float4 p1);
-struct geometric_func_fast_distance : public binary_func<cl_float4, cl_float4, cl_float>
-{
-
-    std::string str()
-    {
-        return "fast_distance";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_geometric>\n";
-    }
-
-    cl_float operator()(const cl_float4& p0, const cl_float4& p1)
-    {
-        cl_double r = 0.0f;
-        cl_double t;
-        for(size_t i = 0; i < 4; i++)
-        {
-            t = static_cast<cl_double>(p0.s[i]) - static_cast<cl_double>(p1.s[i]);
-            r += t * t;
-        }
-        return std::sqrt(r);
-    }
-
-    cl_float4 min1()
-    {
-        return detail::def_limit<cl_float4>(-512.0f);
-    }
-
-    cl_float4 max1()
-    {
-        return detail::def_limit<cl_float4>(512.0f);
-    }
-
-    cl_float4 min2()
-    {
-        return detail::def_limit<cl_float4>(-512.0f);
-    }
-
-    cl_float4 max2()
-    {
-        return detail::def_limit<cl_float4>(512.0f);
-    }
-
-    cl_double delta(const cl_float4& p0, const cl_float4& p1, const cl_float& expected)
-    {
-        (void) p0; (void) p1;
-        return 0.01f * expected;
-    }
-
-    float ulp()
-    {
-        return
-            8192.0f + // error in sqrt
-            (1.5f * 4.0f) + // cumulative error for multiplications
-            (0.5f * 3.0f);  // cumulative error for additions
-    }
-};
-
-// float fast_length(float4 p);
-struct geometric_func_fast_length : public unary_func<cl_float4,cl_float>
-{
-    std::string str()
-    {
-        return "fast_length";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_geometric>\n";
-    }
-
-    cl_float operator()(const cl_float4& p)
-    {
-        cl_double r = 0.0f;
-        for(size_t i = 0; i < 4; i++)
-        {
-            r += static_cast<cl_double>(p.s[i]) * static_cast<cl_double>(p.s[i]);
-        }
-        return std::sqrt(r);
-    }
-
-    cl_float4 min1()
-    {
-        return detail::def_limit<cl_float4>(-512.0f);
-    }
-
-    cl_float4 max1()
-    {
-        return detail::def_limit<cl_float4>(512.0f);
-    }
-
-    cl_double delta(const cl_float4& p, const cl_float& expected)
-    {
-        (void) p;
-        return 0.01f * expected;
-    }
-
-    float ulp()
-    {
-        return
-            8192.0f + // error in sqrt
-            0.5f * // effect on e of taking sqrt( x + e )
-                ((0.5f * 4.0f) + // cumulative error for multiplications
-                (0.5f * 3.0f));  // cumulative error for additions
-    }
-};
-
-// float4 fast_normalize(float4 p);
-struct geometric_func_fast_normalize : public unary_func<cl_float4,cl_float4>
-{
-    std::string str()
-    {
-        return "fast_normalize";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_geometric>\n";
-    }
-
-    cl_float4 operator()(const cl_float4& p)
-    {
-        cl_double t = 0.0f;
-        cl_float4 r;
-        for(size_t i = 0; i < 4; i++)
-        {
-            t += static_cast<cl_double>(p.s[i]) * static_cast<cl_double>(p.s[i]);
-        }
-
-        if(t == 0.0f)
-        {
-            for(size_t i = 0; i < 4; i++)
-            {
-                r.s[i] = 0.0f;
-            }
-            return r;
-        }
-
-        t = std::sqrt(t);
-        for(size_t i = 0; i < 4; i++)
-        {
-            r.s[i] = static_cast<cl_double>(p.s[i]) / t;
-        }
-        return r;
-    }
-
-    cl_float4 min1()
-    {
-        return detail::def_limit<cl_float4>(-512.0f);
-    }
-
-    cl_float4 max1()
-    {
-        return detail::def_limit<cl_float4>(512.0f);
-    }
-
-    std::vector<cl_float4> in_special_cases()
-    {
-        return {
-            {0.0f, 0.0f, 0.0f, 0.0f}
-        };
-    }
-
-
-    cl_double4 delta(const cl_float4& p, const cl_float4& expected)
-    {
-        (void) p;
-        auto e = detail::make_value<cl_double4>(0.01f);
-        return detail::multiply<cl_double4>(e, expected);
-    }
-
-    float ulp()
-    {
-        return
-            8192.5f + // error in rsqrt + error in multiply
-            (0.5f * 4.0f) + // cumulative error for multiplications
-            (0.5f * 3.0f);  // cumulative error for additions
-    }
-};
-
-AUTO_TEST_CASE(test_fast_geometric_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // float fast_distance(float4 p0, float4 p1)
-    TEST_BINARY_FUNC_MACRO((geometric_func_fast_distance()))
-
-    // float fast_length(float4 p)
-    TEST_UNARY_FUNC_MACRO((geometric_func_fast_length()))
-
-    // float4 fast_normalize(float4 p)
-    TEST_UNARY_FUNC_MACRO((geometric_func_fast_normalize()))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_GEOMETRIC_FUNCS_FAST_GEOMETRIC_FUNCS_HPP
diff --git a/test_conformance/clcpp/geometric_funcs/geometric_funcs.hpp b/test_conformance/clcpp/geometric_funcs/geometric_funcs.hpp
deleted file mode 100644
index 561f9e9bd1..0000000000
--- a/test_conformance/clcpp/geometric_funcs/geometric_funcs.hpp
+++ /dev/null
@@ -1,389 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_GEOMETRIC_FUNCS_GEOMETRIC_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_GEOMETRIC_FUNCS_GEOMETRIC_FUNCS_HPP
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#include <type_traits>
-
-// float4 cross(float4 p0, float4 p1)
-struct geometric_func_cross : public binary_func<cl_float4, cl_float4, cl_float4>
-{
-    geometric_func_cross(cl_device_id device)
-    {
-        // On an embedded device w/ round-to-zero, 3 ulps is the worst-case tolerance for cross product
-        this->m_delta = 3.0f * CL_FLT_EPSILON;
-        // RTZ devices accrue approximately double the amount of error per operation.  Allow for that.
-        if(get_default_rounding_mode(device) == CL_FP_ROUND_TO_ZERO)
-        {
-            this->m_delta *= 2.0f;
-        }
-    }
-
-    std::string str()
-    {
-        return "cross";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_geometric>\n";
-    }
-
-    cl_float4 operator()(const cl_float4& p0, const cl_float4& p1)
-    {
-        cl_float4 r;
-        r.s[0] = (p0.s[1] * p1.s[2]) - (p0.s[2] * p1.s[1]);
-        r.s[1] = (p0.s[2] * p1.s[0]) - (p0.s[0] * p1.s[2]);
-        r.s[2] = (p0.s[0] * p1.s[1]) - (p0.s[1] * p1.s[0]);
-        r.s[3] = 0.0f;
-        return r;
-    }
-
-    cl_float4 max1()
-    {
-        return detail::def_limit<cl_float4>(1000.0f);
-    }
-
-    cl_float4 max2()
-    {
-        return detail::def_limit<cl_float4>(1000.0f);
-    }
-
-    cl_float4 min1()
-    {
-        return detail::def_limit<cl_float4>(-1000.0f);
-    }
-
-    cl_float4 min2()
-    {
-        return detail::def_limit<cl_float4>(-1000.0f);
-    }
-
-    bool use_ulp()
-    {
-        return false;
-    }
-
-    cl_double4 delta(const cl_float4& p0, const cl_float4& p1, const cl_float4& expected)
-    {
-        (void) p0; (void) p1;
-        auto e = detail::make_value<cl_double4>(m_delta);
-        return detail::multiply<cl_double4>(e, expected);
-    }
-
-private:
-    cl_double m_delta;
-};
-
-// float dot(float4 p0, float4 p1);
-struct geometric_func_dot : public binary_func<cl_float4, cl_float4, cl_float>
-{
-
-    std::string str()
-    {
-        return "dot";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_geometric>\n";
-    }
-
-    cl_float operator()(const cl_float4& p0, const cl_float4& p1)
-    {
-        cl_float r;
-        r = p0.s[0] * p1.s[0];
-        r += p0.s[1] * p1.s[1];
-        r += p0.s[2] * p1.s[2];
-        r += p0.s[3] * p1.s[3];
-        return r;
-    }
-
-    cl_float4 max1()
-    {
-        return detail::def_limit<cl_float4>(1000.0f);
-    }
-
-    cl_float4 max2()
-    {
-        return detail::def_limit<cl_float4>(1000.0f);
-    }
-
-    cl_float4 min1()
-    {
-        return detail::def_limit<cl_float4>(-1000.0f);
-    }
-
-    cl_float4 min2()
-    {
-        return detail::def_limit<cl_float4>(-1000.0f);
-    }
-
-    bool use_ulp()
-    {
-        return false;
-    }
-
-    cl_double delta(const cl_float4& p0, const cl_float4& p1, cl_float expected)
-    {
-        (void) p0; (void) p1;
-        return expected * ((4.0f + (4.0f - 1.0f)) * CL_FLT_EPSILON);
-    }
-};
-
-// float distance(float4 p0, float4 p1);
-struct geometric_func_distance : public binary_func<cl_float4, cl_float4, cl_float>
-{
-
-    std::string str()
-    {
-        return "distance";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_geometric>\n";
-    }
-
-    cl_float operator()(const cl_float4& p0, const cl_float4& p1)
-    {
-        cl_double r = 0.0f;
-        cl_double t;
-        for(size_t i = 0; i < 4; i++)
-        {
-            t = static_cast<cl_double>(p0.s[i]) - static_cast<cl_double>(p1.s[i]);
-            r += t * t;
-        }
-        return std::sqrt(r);
-    }
-
-    cl_float4 max1()
-    {
-        return detail::def_limit<cl_float4>(1000.0f);
-    }
-
-    cl_float4 max2()
-    {
-        return detail::def_limit<cl_float4>(1000.0f);
-    }
-
-    cl_float4 min1()
-    {
-        return detail::def_limit<cl_float4>(-1000.0f);
-    }
-
-    cl_float4 min2()
-    {
-        return detail::def_limit<cl_float4>(-1000.0f);
-    }
-
-    float ulp()
-    {
-        return
-            3.0f + // error in sqrt
-            (1.5f * 4.0f) + // cumulative error for multiplications
-            (0.5f * 3.0f);  // cumulative error for additions
-    }
-};
-
-// float length(float4 p);
-struct geometric_func_length : public unary_func<cl_float4,cl_float>
-{
-
-    std::string str()
-    {
-        return "length";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_geometric>\n";
-    }
-
-    cl_float operator()(const cl_float4& p)
-    {
-        cl_double r = 0.0f;
-        for(size_t i = 0; i < 4; i++)
-        {
-            r += static_cast<cl_double>(p.s[i]) * static_cast<cl_double>(p.s[i]);
-        }
-        return std::sqrt(r);
-    }
-
-    cl_float4 max1()
-    {
-        return detail::def_limit<cl_float4>(1000.0f);
-    }
-
-    cl_float4 min1()
-    {
-        return detail::def_limit<cl_float4>(-1000.0f);
-    }
-
-    float ulp()
-    {
-        return
-            3.0f + // error in sqrt
-            0.5f * // effect on e of taking sqrt( x + e )
-                ((0.5f * 4.0f) + // cumulative error for multiplications
-                (0.5f * 3.0f));  // cumulative error for additions
-    }
-};
-
-// float4 normalize(float4 p);
-struct geometric_func_normalize : public unary_func<cl_float4,cl_float4>
-{
-    std::string str()
-    {
-        return "normalize";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_geometric>\n";
-    }
-
-    cl_float4 operator()(const cl_float4& p)
-    {
-        cl_double t = 0.0f;
-        cl_float4 r;
-
-        // normalize( v ) returns a vector full of NaNs if any element is a NaN.
-        for(size_t i = 0; i < 4; i++)
-        {
-            if((std::isnan)(p.s[i]))
-            {
-                for(size_t j = 0; j < 4; j++)
-                {
-                    r.s[j] = p.s[i];
-                }
-                return r;
-            }
-        }
-
-        // normalize( v ) for which any element in v is infinite shall proceed as
-        // if the elements in v were replaced as follows:
-        // for( i = 0; i < sizeof(v) / sizeof(v[0] ); i++ )
-        //     v[i] = isinf(v[i]) ? copysign(1.0, v[i]) : 0.0 * v [i];
-        for(size_t i = 0; i < 4; i++)
-        {
-            if((std::isinf)(p.s[i]))
-            {
-                for(size_t j = 0; j < 4; j++)
-                {
-                    r.s[j] = (std::isinf)(p.s[j]) ? (std::copysign)(1.0, p.s[j]) : 0.0 * p.s[j];
-                }
-                r = (*this)(r);
-                return r;
-            }
-        }
-
-        for(size_t i = 0; i < 4; i++)
-        {
-            t += static_cast<cl_double>(p.s[i]) * static_cast<cl_double>(p.s[i]);
-        }
-
-        // normalize( v ) returns v if all elements of v are zero.
-        if(t == 0.0f)
-        {
-            for(size_t i = 0; i < 4; i++)
-            {
-                r.s[i] = 0.0f;
-            }
-            return r;
-        }
-
-        t = std::sqrt(t);
-        for(size_t i = 0; i < 4; i++)
-        {
-            r.s[i] = static_cast<cl_double>(p.s[i]) / t;
-        }
-
-        return r;
-    }
-
-    cl_float4 max1()
-    {
-        return detail::def_limit<cl_float4>(1000.0f);
-    }
-
-    cl_float4 min1()
-    {
-        return detail::def_limit<cl_float4>(-1000.0f);
-    }
-
-    std::vector<cl_float4> in_special_cases()
-    {
-        return {
-            {0.0f, 0.0f, 0.0f, 0.0f},
-            {std::numeric_limits<float>::infinity(), 0.0f, 0.0f, 0.0f},
-            {
-                std::numeric_limits<float>::infinity(),
-                std::numeric_limits<float>::infinity(),
-                std::numeric_limits<float>::infinity(),
-                std::numeric_limits<float>::infinity()
-            },
-            {
-                std::numeric_limits<float>::infinity(),
-                1.0f,
-                0.0f,
-                std::numeric_limits<float>::quiet_NaN()
-            },
-            {-1.0f, -1.0f, 0.0f,-300.0f}
-        };
-    }
-
-    float ulp()
-    {
-        return
-            2.5f + // error in rsqrt + error in multiply
-            (0.5f * 4.0f) + // cumulative error for multiplications
-            (0.5f * 3.0f);  // cumulative error for additions
-    }
-};
-
-AUTO_TEST_CASE(test_geometric_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // float4 cross(float4 p0, float4 p1)
-    TEST_BINARY_FUNC_MACRO((geometric_func_cross(device)))
-
-    // float dot(float4 p0, float4 p1)
-    TEST_BINARY_FUNC_MACRO((geometric_func_dot()))
-
-    // float distance(float4 p0, float4 p1)
-    TEST_BINARY_FUNC_MACRO((geometric_func_distance()))
-
-    // float length(float4 p)
-    TEST_UNARY_FUNC_MACRO((geometric_func_length()))
-
-    // float4 normalize(float4 p)
-    TEST_UNARY_FUNC_MACRO((geometric_func_normalize()))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_GEOMETRIC_FUNCS_GEOMETRIC_FUNCS_HPP
diff --git a/test_conformance/clcpp/geometric_funcs/main.cpp b/test_conformance/clcpp/geometric_funcs/main.cpp
deleted file mode 100644
index ed35805c95..0000000000
--- a/test_conformance/clcpp/geometric_funcs/main.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include <limits>
-
-#include "../common.hpp"
-
-#include "geometric_funcs.hpp"
-#include "fast_geometric_funcs.hpp"
-
-int main(int argc, const char *argv[])
-{
-    // Check if cl_float (float) and cl_double (double) fulfill the requirements of
-    // IEC 559 (IEEE 754) standard. This is required for the tests to run correctly.
-    if(!std::numeric_limits<cl_float>::is_iec559)
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "cl_float (float) does not fulfill the requirements of IEC 559 (IEEE 754) standard. "
-            "Tests won't run correctly."
-        );
-    }
-    if(!std::numeric_limits<cl_double>::is_iec559)
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "cl_double (double) does not fulfill the requirements of IEC 559 (IEEE 754) standard. "
-            "Tests won't run correctly."
-        );
-    }
-
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/images/CMakeLists.txt b/test_conformance/clcpp/images/CMakeLists.txt
deleted file mode 100644
index 3c92ecd7df..0000000000
--- a/test_conformance/clcpp/images/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_IMAGES)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/images/common.hpp b/test_conformance/clcpp/images/common.hpp
deleted file mode 100644
index 957d266dcf..0000000000
--- a/test_conformance/clcpp/images/common.hpp
+++ /dev/null
@@ -1,195 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_IMAGES_COMMON_HPP
-#define TEST_CONFORMANCE_CLCPP_IMAGES_COMMON_HPP
-
-#include <type_traits>
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#include "../harness/imageHelpers.h"
-
-
-namespace detail
-{
-
-template<cl_channel_type channel_type>
-struct channel_info;
-
-template<>
-struct channel_info<CL_SIGNED_INT8>
-{
-    typedef cl_char channel_type;
-    typedef cl_int4 element_type;
-    static std::string function_suffix() { return "i"; }
-
-    channel_type channel_min() { return (std::numeric_limits<channel_type>::min)(); }
-    channel_type channel_max() { return (std::numeric_limits<channel_type>::max)(); }
-};
-
-template<>
-struct channel_info<CL_SIGNED_INT16>
-{
-    typedef cl_short channel_type;
-    typedef cl_int4 element_type;
-    static std::string function_suffix() { return "i"; }
-
-    channel_type channel_min() { return (std::numeric_limits<channel_type>::min)(); }
-    channel_type channel_max() { return (std::numeric_limits<channel_type>::max)(); }
-};
-
-template<>
-struct channel_info<CL_SIGNED_INT32>
-{
-    typedef cl_int channel_type;
-    typedef cl_int4 element_type;
-    static std::string function_suffix() { return "i"; }
-
-    channel_type channel_min() { return (std::numeric_limits<channel_type>::min)(); }
-    channel_type channel_max() { return (std::numeric_limits<channel_type>::max)(); }
-};
-
-template<>
-struct channel_info<CL_UNSIGNED_INT8>
-{
-    typedef cl_uchar channel_type;
-    typedef cl_uint4 element_type;
-    static std::string function_suffix() { return "ui"; }
-
-    channel_type channel_min() { return (std::numeric_limits<channel_type>::min)(); }
-    channel_type channel_max() { return (std::numeric_limits<channel_type>::max)(); }
-};
-
-template<>
-struct channel_info<CL_UNSIGNED_INT16>
-{
-    typedef cl_ushort channel_type;
-    typedef cl_uint4 element_type;
-    static std::string function_suffix() { return "ui"; }
-
-    channel_type channel_min() { return (std::numeric_limits<channel_type>::min)(); }
-    channel_type channel_max() { return (std::numeric_limits<channel_type>::max)(); }
-};
-
-template<>
-struct channel_info<CL_UNSIGNED_INT32>
-{
-    typedef cl_uint channel_type;
-    typedef cl_uint4 element_type;
-    static std::string function_suffix() { return "ui"; }
-
-    channel_type channel_min() { return (std::numeric_limits<channel_type>::min)(); }
-    channel_type channel_max() { return (std::numeric_limits<channel_type>::max)(); }
-};
-
-template<>
-struct channel_info<CL_FLOAT>
-{
-    typedef cl_float channel_type;
-    typedef cl_float4 element_type;
-    static std::string function_suffix() { return "f"; }
-
-    channel_type channel_min() { return -1e-3f; }
-    channel_type channel_max() { return +1e+3f; }
-};
-
-template<cl_mem_object_type image_type>
-struct image_info;
-
-template<>
-struct image_info<CL_MEM_OBJECT_IMAGE1D>
-{
-    static std::string image_type_name() { return "image1d"; }
-    static std::string coord_accessor() { return "x"; }
-};
-
-template<>
-struct image_info<CL_MEM_OBJECT_IMAGE2D>
-{
-    static std::string image_type_name() { return "image2d"; }
-    static std::string coord_accessor() { return "xy"; }
-};
-
-template<>
-struct image_info<CL_MEM_OBJECT_IMAGE3D>
-{
-    static std::string image_type_name() { return "image3d"; }
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    static std::string coord_accessor() { return "xyzw"; }
-#else
-    static std::string coord_accessor() { return "xyz"; }
-#endif
-};
-
-} // namespace
-
-template<cl_mem_object_type ImageType, cl_channel_type ChannelType>
-struct image_test_base :
-    detail::channel_info<ChannelType>,
-    detail::image_info<ImageType>
-{ };
-
-// Create image_descriptor (used by harness/imageHelpers functions)
-image_descriptor create_image_descriptor(cl_image_desc &image_desc, cl_image_format *image_format)
-{
-    image_descriptor image_info;
-    image_info.width = image_desc.image_width;
-    image_info.height = image_desc.image_height;
-    image_info.depth = image_desc.image_depth;
-    image_info.arraySize = image_desc.image_array_size;
-    image_info.rowPitch = image_desc.image_row_pitch;
-    image_info.slicePitch = image_desc.image_slice_pitch;
-    image_info.format = image_format;
-    image_info.buffer = image_desc.mem_object;
-    image_info.type = image_desc.image_type;
-    image_info.num_mip_levels = image_desc.num_mip_levels;
-    return image_info;
-}
-
-const std::vector<cl_channel_order> get_channel_orders(cl_device_id device)
-{
-    // According to "Minimum List of Supported Image Formats" of OpenCL specification:
-    return { CL_R, CL_RG, CL_RGBA };
-}
-
-bool is_test_supported(cl_device_id device)
-{
-    // Check for image support
-    if (checkForImageSupport(device) == CL_IMAGE_FORMAT_NOT_SUPPORTED)
-    {
-        log_info("SKIPPED: Device does not support images. Skipping test.\n");
-        return false;
-    }
-    return true;
-}
-
-// Checks if x is equal to y.
-template<class type>
-inline bool are_equal(const type& x,
-                      const type& y)
-{
-    for(size_t i = 0; i < vector_size<type>::value; i++)
-    {
-        if(!(x.s[i] == y.s[i]))
-        {
-            return false;
-        }
-    }
-    return true;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_IMAGES_COMMON_HPP
diff --git a/test_conformance/clcpp/images/main.cpp b/test_conformance/clcpp/images/main.cpp
deleted file mode 100644
index bbda559d97..0000000000
--- a/test_conformance/clcpp/images/main.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "test_read.hpp"
-#include "test_sample.hpp"
-#include "test_write.hpp"
-
-// FIXME: To use certain functions in test_common/harness/imageHelpers.h
-// (for example, generate_random_image_data()), the tests are required to declare
-// the following variable (hangover from code specific to Apple's implementation):
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/images/test_read.hpp b/test_conformance/clcpp/images/test_read.hpp
deleted file mode 100644
index 3bc7b5efb2..0000000000
--- a/test_conformance/clcpp/images/test_read.hpp
+++ /dev/null
@@ -1,307 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_IMAGES_TEST_READ_HPP
-#define TEST_CONFORMANCE_CLCPP_IMAGES_TEST_READ_HPP
-
-#include <sstream>
-#include <string>
-#include <tuple>
-#include <vector>
-
-#include "common.hpp"
-
-
-namespace test_images_read {
-
-template<cl_mem_object_type ImageType, cl_channel_type ChannelType>
-struct image_test : image_test_base<ImageType, ChannelType>
-{
-    cl_channel_order channel_order;
-
-    image_test(cl_channel_order channel_order) :
-        channel_order(channel_order)
-    { }
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    std::string generate_source()
-    {
-        std::stringstream s;
-        s << R"(
-        typedef )" << type_name<typename image_test::element_type>() << R"( element_type;
-
-        kernel void test(
-            read_only )" << image_test::image_type_name() << R"(_t img,
-            const global int4 *coords,
-            global element_type *output
-        ) {
-            const ulong gid = get_global_linear_id();
-
-            output[gid] = read_image)" << image_test::function_suffix() <<
-                "(img, coords[gid]." << image_test::coord_accessor() << R"();
-        }
-        )";
-
-        return s.str();
-    }
-#else
-    std::string generate_source()
-    {
-        std::stringstream s;
-        s << R"(
-        #include <opencl_memory>
-        #include <opencl_common>
-        #include <opencl_work_item>
-        #include <opencl_image>
-        using namespace cl;
-        )";
-
-        s << R"(
-        typedef )" << type_name<typename image_test::element_type>() <<  R"( element_type;
-
-        kernel void test(
-            const )" << image_test::image_type_name() << R"(<element_type, image_access::read> img,
-            const global_ptr<int4[]> coords,
-            global_ptr<element_type[]> output
-        ) {
-            const ulong gid = get_global_linear_id();
-
-            output[gid] = img.read(coords[gid].)" << image_test::coord_accessor() << R"();
-        }
-        )";
-
-        return s.str();
-    }
-#endif
-
-    int run(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-    {
-        int error = CL_SUCCESS;
-
-        cl_program program;
-        cl_kernel kernel;
-
-        std::string kernel_name = "test";
-        std::string source = generate_source();
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-        error = create_opencl_kernel(
-            context, &program, &kernel,
-            source, kernel_name
-        );
-        RETURN_ON_ERROR(error)
-        return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-        error = create_opencl_kernel(
-            context, &program, &kernel,
-            source, kernel_name, "-cl-std=CL2.0", false
-        );
-        RETURN_ON_ERROR(error)
-// Normal run
-#else
-        error = create_opencl_kernel(
-            context, &program, &kernel,
-            source, kernel_name
-        );
-        RETURN_ON_ERROR(error)
-#endif
-
-        using element_type = typename image_test::element_type;
-        using coord_type = cl_int4;
-        using scalar_element_type = typename scalar_type<element_type>::type;
-        using channel_type = typename image_test::channel_type;
-
-        cl_image_format image_format;
-        image_format.image_channel_order = channel_order;
-        image_format.image_channel_data_type = ChannelType;
-
-        const size_t pixel_size = get_pixel_size(&image_format);
-        const size_t channel_count = get_channel_order_channel_count(image_format.image_channel_order);
-
-        cl_image_desc image_desc;
-        image_desc.image_type = ImageType;
-        if (ImageType == CL_MEM_OBJECT_IMAGE1D)
-        {
-            image_desc.image_width = 2048;
-            image_desc.image_height = 1;
-            image_desc.image_depth = 1;
-        }
-        else if (ImageType == CL_MEM_OBJECT_IMAGE2D)
-        {
-            image_desc.image_width = 256;
-            image_desc.image_height = 256;
-            image_desc.image_depth = 1;
-        }
-        else if (ImageType == CL_MEM_OBJECT_IMAGE3D)
-        {
-            image_desc.image_width = 64;
-            image_desc.image_height = 64;
-            image_desc.image_depth = 64;
-        }
-        image_desc.image_array_size = 0;
-        image_desc.image_row_pitch = image_desc.image_width * pixel_size;
-        image_desc.image_slice_pitch = image_desc.image_row_pitch * image_desc.image_height;
-        image_desc.num_mip_levels = 0;
-        image_desc.num_samples = 0;
-        image_desc.mem_object = NULL;
-
-        image_descriptor image_info = create_image_descriptor(image_desc, &image_format);
-
-        std::vector<channel_type> image_values = generate_input(
-            image_desc.image_width * image_desc.image_height * image_desc.image_depth * channel_count,
-            image_test::channel_min(), image_test::channel_max(),
-            std::vector<channel_type>()
-        );
-
-        const size_t count = num_elements;
-
-        std::vector<coord_type> coords = generate_input(
-            count,
-            detail::make_value<coord_type>(0),
-            coord_type {
-                static_cast<cl_int>(image_desc.image_width - 1),
-                static_cast<cl_int>(image_desc.image_height - 1),
-                static_cast<cl_int>(image_desc.image_depth - 1),
-                0
-            },
-            std::vector<coord_type>()
-        );
-
-        cl_mem img = clCreateImage(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
-            &image_format, &image_desc, static_cast<void *>(image_values.data()), &error);
-        RETURN_ON_CL_ERROR(error, "clCreateImage")
-
-        cl_mem coords_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
-            sizeof(coord_type) * count, static_cast<void *>(coords.data()), &error);
-        RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-        cl_mem output_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(element_type) * count, NULL, &error);
-        RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &img);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        error = clSetKernelArg(kernel, 1, sizeof(coords_buffer), &coords_buffer);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        error = clSetKernelArg(kernel, 2, sizeof(output_buffer), &output_buffer);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-        const size_t global_size = count;
-        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, NULL);
-        RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-        std::vector<element_type> output(count);
-        error = clEnqueueReadBuffer(
-            queue, output_buffer, CL_TRUE,
-            0, sizeof(element_type) * count,
-            static_cast<void *>(output.data()),
-            0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-        for (size_t i = 0; i < count; i++)
-        {
-            const coord_type c = coords[i];
-            const element_type result = output[i];
-
-            element_type expected;
-            read_image_pixel<scalar_element_type>(static_cast<void *>(image_values.data()), &image_info,
-                c.s[0], c.s[1], c.s[2],
-                expected.s);
-
-            if (!are_equal(result, expected))
-            {
-                RETURN_ON_ERROR_MSG(-1,
-                    "Reading from coordinates %s failed. Expected: %s, got: %s",
-                    format_value(c).c_str(), format_value(expected).c_str(), format_value(result).c_str()
-                );
-            }
-        }
-
-        clReleaseMemObject(img);
-        clReleaseMemObject(coords_buffer);
-        clReleaseMemObject(output_buffer);
-        clReleaseKernel(kernel);
-        clReleaseProgram(program);
-        return error;
-    }
-};
-
-template<cl_mem_object_type ImageType>
-int run_test_cases(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    if (!is_test_supported(device))
-        return CL_SUCCESS;
-
-    int error = CL_SUCCESS;
-
-    for (auto channel_order : get_channel_orders(device))
-    {
-        error = image_test<ImageType, CL_SIGNED_INT8>(channel_order)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-        error = image_test<ImageType, CL_SIGNED_INT16>(channel_order)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-        error = image_test<ImageType, CL_SIGNED_INT32>(channel_order)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-
-        error = image_test<ImageType, CL_UNSIGNED_INT8>(channel_order)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-        error = image_test<ImageType, CL_UNSIGNED_INT16>(channel_order)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-        error = image_test<ImageType, CL_UNSIGNED_INT32>(channel_order)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-
-        error = image_test<ImageType, CL_FLOAT>(channel_order)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-    }
-
-    return error;
-}
-
-
-AUTO_TEST_CASE(test_images_read_1d)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    return run_test_cases<CL_MEM_OBJECT_IMAGE1D>(device, context, queue, num_elements);
-}
-
-AUTO_TEST_CASE(test_images_read_2d)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    return run_test_cases<CL_MEM_OBJECT_IMAGE2D>(device, context, queue, num_elements);
-}
-
-AUTO_TEST_CASE(test_images_read_3d)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    return run_test_cases<CL_MEM_OBJECT_IMAGE3D>(device, context, queue, num_elements);
-}
-
-} // namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_IMAGES_TEST_READ_HPP
diff --git a/test_conformance/clcpp/images/test_sample.hpp b/test_conformance/clcpp/images/test_sample.hpp
deleted file mode 100644
index a96a563a32..0000000000
--- a/test_conformance/clcpp/images/test_sample.hpp
+++ /dev/null
@@ -1,363 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_IMAGES_TEST_SAMPLE_HPP
-#define TEST_CONFORMANCE_CLCPP_IMAGES_TEST_SAMPLE_HPP
-
-#include <sstream>
-#include <string>
-#include <tuple>
-#include <vector>
-
-#include "common.hpp"
-
-
-namespace test_images_sample {
-
-enum class sampler_source
-{
-    param,
-    program_scope
-};
-
-const sampler_source sampler_sources[] = { sampler_source::param, sampler_source::program_scope };
-
-template<cl_mem_object_type ImageType, cl_channel_type ChannelType>
-struct image_test : image_test_base<ImageType, ChannelType>
-{
-    cl_channel_order channel_order;
-    sampler_source source;
-
-    image_test(cl_channel_order channel_order, sampler_source source) :
-        channel_order(channel_order),
-        source(source)
-    { }
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    std::string generate_source()
-    {
-        std::stringstream s;
-        s << R"(
-        typedef )" << type_name<typename image_test::element_type>() << R"( element_type;
-        )";
-
-        std::string sampler;
-        if (source == sampler_source::program_scope)
-        {
-            s << R"(
-        constant sampler_t sampler_program_scope = CLK_FILTER_NEAREST | CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE;
-            )";
-            sampler = "sampler_program_scope";
-        }
-        else if (source == sampler_source::param)
-        {
-            sampler = "sampler_param";
-        }
-
-        s << R"(
-        kernel void test(
-            read_only )" << image_test::image_type_name() << R"(_t img,
-            const global int4 *coords,
-            global element_type *output,
-            sampler_t sampler_param
-        ) {
-            const ulong gid = get_global_linear_id();
-
-            output[gid] = read_image)" << image_test::function_suffix() <<
-                "(img, " << sampler << ", coords[gid]." << image_test::coord_accessor() << R"();
-        }
-        )";
-
-        return s.str();
-    }
-#else
-    std::string generate_source()
-    {
-        std::stringstream s;
-        s << R"(
-        #include <opencl_memory>
-        #include <opencl_common>
-        #include <opencl_work_item>
-        #include <opencl_image>
-        using namespace cl;
-        )";
-
-        s << R"(
-        typedef )" << type_name<typename image_test::element_type>() <<  R"( element_type;
-        )";
-
-        std::string sampler;
-        if (source == sampler_source::program_scope)
-        {
-            s << R"(
-        sampler sampler_program_scope = make_sampler<addressing_mode::none, normalized_coordinates::unnormalized, filtering_mode::nearest>();
-            )";
-            sampler = "sampler_program_scope";
-        }
-        else if (source == sampler_source::param)
-        {
-            sampler = "sampler_param";
-        }
-
-        s << R"(
-        kernel void test(
-            const )" << image_test::image_type_name() << R"(<element_type, image_access::sample> img,
-            const global_ptr<int4[]> coords,
-            global_ptr<element_type[]> output,
-            sampler sampler_param
-        ) {
-            const ulong gid = get_global_linear_id();
-
-            output[gid] = img.sample()" << sampler << ", coords[gid]." << image_test::coord_accessor() << R"();
-        }
-        )";
-
-        return s.str();
-    }
-#endif
-
-    int run(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-    {
-        int error = CL_SUCCESS;
-
-        cl_program program;
-        cl_kernel kernel;
-
-        std::string kernel_name = "test";
-        std::string source = generate_source();
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-        error = create_opencl_kernel(
-            context, &program, &kernel,
-            source, kernel_name
-        );
-        RETURN_ON_ERROR(error)
-        return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-        error = create_opencl_kernel(
-            context, &program, &kernel,
-            source, kernel_name, "-cl-std=CL2.0", false
-        );
-        RETURN_ON_ERROR(error)
-// Normal run
-#else
-        error = create_opencl_kernel(
-            context, &program, &kernel,
-            source, kernel_name
-        );
-        RETURN_ON_ERROR(error)
-#endif
-
-        using element_type = typename image_test::element_type;
-        using coord_type = cl_int4;
-        using scalar_element_type = typename scalar_type<element_type>::type;
-        using channel_type = typename image_test::channel_type;
-
-        cl_image_format image_format;
-        image_format.image_channel_order = channel_order;
-        image_format.image_channel_data_type = ChannelType;
-
-        const size_t pixel_size = get_pixel_size(&image_format);
-        const size_t channel_count = get_channel_order_channel_count(image_format.image_channel_order);
-
-        cl_image_desc image_desc;
-        image_desc.image_type = ImageType;
-        if (ImageType == CL_MEM_OBJECT_IMAGE1D)
-        {
-            image_desc.image_width = 2048;
-            image_desc.image_height = 1;
-            image_desc.image_depth = 1;
-        }
-        else if (ImageType == CL_MEM_OBJECT_IMAGE2D)
-        {
-            image_desc.image_width = 256;
-            image_desc.image_height = 256;
-            image_desc.image_depth = 1;
-        }
-        else if (ImageType == CL_MEM_OBJECT_IMAGE3D)
-        {
-            image_desc.image_width = 64;
-            image_desc.image_height = 64;
-            image_desc.image_depth = 64;
-        }
-        image_desc.image_array_size = 0;
-        image_desc.image_row_pitch = image_desc.image_width * pixel_size;
-        image_desc.image_slice_pitch = image_desc.image_row_pitch * image_desc.image_height;
-        image_desc.num_mip_levels = 0;
-        image_desc.num_samples = 0;
-        image_desc.mem_object = NULL;
-
-        image_descriptor image_info = create_image_descriptor(image_desc, &image_format);
-
-        std::vector<channel_type> image_values = generate_input(
-            image_desc.image_width * image_desc.image_height * image_desc.image_depth * channel_count,
-            image_test::channel_min(), image_test::channel_max(),
-            std::vector<channel_type>()
-        );
-
-        const size_t count = num_elements;
-
-        std::vector<coord_type> coords = generate_input(
-            count,
-            detail::make_value<coord_type>(0),
-            coord_type {
-                static_cast<cl_int>(image_desc.image_width - 1),
-                static_cast<cl_int>(image_desc.image_height - 1),
-                static_cast<cl_int>(image_desc.image_depth - 1),
-                0
-            },
-            std::vector<coord_type>()
-        );
-
-        cl_mem img = clCreateImage(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
-            &image_format, &image_desc, static_cast<void *>(image_values.data()), &error);
-        RETURN_ON_CL_ERROR(error, "clCreateImage")
-
-        cl_mem coords_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
-            sizeof(coord_type) * count, static_cast<void *>(coords.data()), &error);
-        RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-        cl_mem output_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(element_type) * count, NULL, &error);
-        RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-        const cl_sampler_properties sampler_properties[] = {
-            CL_SAMPLER_NORMALIZED_COORDS, CL_FALSE,
-            CL_SAMPLER_ADDRESSING_MODE, CL_ADDRESS_NONE,
-            CL_SAMPLER_FILTER_MODE, CL_FILTER_NEAREST,
-            0
-        };
-        cl_sampler sampler = clCreateSamplerWithProperties(context, sampler_properties, &error);
-        RETURN_ON_CL_ERROR(error, "clCreateSamplerWithProperties")
-
-        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &img);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        error = clSetKernelArg(kernel, 1, sizeof(coords_buffer), &coords_buffer);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        error = clSetKernelArg(kernel, 2, sizeof(output_buffer), &output_buffer);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        error = clSetKernelArg(kernel, 3, sizeof(sampler), &sampler);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-        const size_t global_size = count;
-        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, NULL);
-        RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-        std::vector<element_type> output(count);
-        error = clEnqueueReadBuffer(
-            queue, output_buffer, CL_TRUE,
-            0, sizeof(element_type) * count,
-            static_cast<void *>(output.data()),
-            0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-        for (size_t i = 0; i < count; i++)
-        {
-            const coord_type c = coords[i];
-            const element_type result = output[i];
-
-            element_type expected;
-            read_image_pixel<scalar_element_type>(static_cast<void *>(image_values.data()), &image_info,
-                c.s[0], c.s[1], c.s[2],
-                expected.s);
-
-            if (!are_equal(result, expected))
-            {
-                RETURN_ON_ERROR_MSG(-1,
-                    "Sampling from coordinates %s failed. Expected: %s, got: %s",
-                    format_value(c).c_str(), format_value(expected).c_str(), format_value(result).c_str()
-                );
-            }
-        }
-
-        clReleaseMemObject(img);
-        clReleaseMemObject(coords_buffer);
-        clReleaseMemObject(output_buffer);
-        clReleaseSampler(sampler);
-        clReleaseKernel(kernel);
-        clReleaseProgram(program);
-        return error;
-    }
-};
-
-template<cl_mem_object_type ImageType>
-int run_test_cases(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    if (!is_test_supported(device))
-        return CL_SUCCESS;
-
-    int error = CL_SUCCESS;
-
-    for (auto channel_order : get_channel_orders(device))
-    for (auto source : sampler_sources)
-    {
-        error = image_test<ImageType, CL_SIGNED_INT8>(channel_order, source)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-        error = image_test<ImageType, CL_SIGNED_INT16>(channel_order, source)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-        error = image_test<ImageType, CL_SIGNED_INT32>(channel_order, source)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-
-        error = image_test<ImageType, CL_UNSIGNED_INT8>(channel_order, source)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-        error = image_test<ImageType, CL_UNSIGNED_INT16>(channel_order, source)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-        error = image_test<ImageType, CL_UNSIGNED_INT32>(channel_order, source)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-
-        error = image_test<ImageType, CL_FLOAT>(channel_order, source)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-    }
-
-    return error;
-}
-
-
-AUTO_TEST_CASE(test_images_sample_1d)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    return run_test_cases<CL_MEM_OBJECT_IMAGE1D>(device, context, queue, num_elements);
-}
-
-AUTO_TEST_CASE(test_images_sample_2d)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    return run_test_cases<CL_MEM_OBJECT_IMAGE2D>(device, context, queue, num_elements);
-}
-
-AUTO_TEST_CASE(test_images_sample_3d)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    return run_test_cases<CL_MEM_OBJECT_IMAGE3D>(device, context, queue, num_elements);
-}
-
-} // namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_IMAGES_TEST_SAMPLE_HPP
diff --git a/test_conformance/clcpp/images/test_write.hpp b/test_conformance/clcpp/images/test_write.hpp
deleted file mode 100644
index 0f54487456..0000000000
--- a/test_conformance/clcpp/images/test_write.hpp
+++ /dev/null
@@ -1,327 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_IMAGES_TEST_WRITE_HPP
-#define TEST_CONFORMANCE_CLCPP_IMAGES_TEST_WRITE_HPP
-
-#include <algorithm>
-#include <sstream>
-#include <string>
-#include <tuple>
-#include <vector>
-
-#include "common.hpp"
-
-
-namespace test_images_write {
-
-template<cl_mem_object_type ImageType, cl_channel_type ChannelType>
-struct image_test : image_test_base<ImageType, ChannelType>
-{
-    cl_channel_order channel_order;
-
-    image_test(cl_channel_order channel_order) :
-        channel_order(channel_order)
-    { }
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    std::string generate_source()
-    {
-        std::stringstream s;
-        s << R"(
-        typedef )" << type_name<typename image_test::element_type>() << R"( element_type;
-
-        kernel void test(
-            write_only )" << image_test::image_type_name() << R"(_t img,
-            const global int4 *coords,
-            const global element_type *input
-        ) {
-            const ulong gid = get_global_linear_id();
-
-            write_image)" << image_test::function_suffix() <<
-                "(img, coords[gid]." << image_test::coord_accessor() << R"(, input[gid]);
-        }
-        )";
-
-        return s.str();
-    }
-#else
-    std::string generate_source()
-    {
-        std::stringstream s;
-        s << R"(
-        #include <opencl_memory>
-        #include <opencl_common>
-        #include <opencl_work_item>
-        #include <opencl_image>
-        using namespace cl;
-        )";
-
-        s << R"(
-        typedef )" << type_name<typename image_test::element_type>() <<  R"( element_type;
-
-        kernel void test(
-            )" << image_test::image_type_name() << R"(<element_type, image_access::write> img,
-            const global_ptr<int4[]> coords,
-            const global_ptr<element_type[]> input
-        ) {
-            const ulong gid = get_global_linear_id();
-
-            img.write(coords[gid].)" << image_test::coord_accessor() << R"(, input[gid]);
-        }
-        )";
-
-        return s.str();
-    }
-#endif
-
-    int run(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-    {
-        int error = CL_SUCCESS;
-
-        cl_program program;
-        cl_kernel kernel;
-
-        std::string kernel_name = "test";
-        std::string source = generate_source();
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-        error = create_opencl_kernel(
-            context, &program, &kernel,
-            source, kernel_name
-        );
-        RETURN_ON_ERROR(error)
-        return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-        error = create_opencl_kernel(
-            context, &program, &kernel,
-            source, kernel_name, "-cl-std=CL2.0", false
-        );
-        RETURN_ON_ERROR(error)
-// Normal run
-#else
-        error = create_opencl_kernel(
-            context, &program, &kernel,
-            source, kernel_name
-        );
-        RETURN_ON_ERROR(error)
-#endif
-
-        using element_type = typename image_test::element_type;
-        using coord_type = cl_int4;
-        using scalar_element_type = typename scalar_type<element_type>::type;
-        using channel_type = typename image_test::channel_type;
-
-        cl_image_format image_format;
-        image_format.image_channel_order = channel_order;
-        image_format.image_channel_data_type = ChannelType;
-
-        const size_t pixel_size = get_pixel_size(&image_format);
-        const size_t channel_count = get_channel_order_channel_count(image_format.image_channel_order);
-
-        cl_image_desc image_desc;
-        image_desc.image_type = ImageType;
-        if (ImageType == CL_MEM_OBJECT_IMAGE1D)
-        {
-            image_desc.image_width = 2048;
-            image_desc.image_height = 1;
-            image_desc.image_depth = 1;
-        }
-        else if (ImageType == CL_MEM_OBJECT_IMAGE2D)
-        {
-            image_desc.image_width = 256;
-            image_desc.image_height = 256;
-            image_desc.image_depth = 1;
-        }
-        else if (ImageType == CL_MEM_OBJECT_IMAGE3D)
-        {
-            image_desc.image_width = 64;
-            image_desc.image_height = 64;
-            image_desc.image_depth = 64;
-        }
-        image_desc.image_array_size = 0;
-        image_desc.image_row_pitch = image_desc.image_width * pixel_size;
-        image_desc.image_slice_pitch = image_desc.image_row_pitch * image_desc.image_height;
-        image_desc.num_mip_levels = 0;
-        image_desc.num_samples = 0;
-        image_desc.mem_object = NULL;
-
-        image_descriptor image_info = create_image_descriptor(image_desc, &image_format);
-
-        std::vector<channel_type> random_image_values = generate_input(
-            image_desc.image_width * image_desc.image_height * image_desc.image_depth * channel_count,
-            image_test::channel_min(), image_test::channel_max(),
-            std::vector<channel_type>()
-        );
-
-        const size_t count = num_elements;
-
-        std::vector<coord_type> coords = generate_input(
-            count,
-            detail::make_value<coord_type>(0),
-            coord_type {
-                static_cast<cl_int>(image_desc.image_width - 1),
-                static_cast<cl_int>(image_desc.image_height - 1),
-                static_cast<cl_int>(image_desc.image_depth - 1),
-                0
-            },
-            std::vector<coord_type>()
-        );
-
-        std::vector<element_type> input(count);
-        for (size_t i = 0; i < count; i++)
-        {
-            const coord_type c = coords[i];
-
-            // Use read_image_pixel from harness/imageHelpers to fill input values
-            // (it will deal with correct channels, orders etc.)
-            read_image_pixel<scalar_element_type>(static_cast<void *>(random_image_values.data()), &image_info,
-                c.s[0], c.s[1], c.s[2],
-                input[i].s);
-        }
-
-        // image_row_pitch and image_slice_pitch must be 0, when clCreateImage is used with host_ptr = NULL
-        image_desc.image_row_pitch = 0;
-        image_desc.image_slice_pitch = 0;
-        cl_mem img = clCreateImage(context, CL_MEM_WRITE_ONLY,
-            &image_format, &image_desc, NULL, &error);
-        RETURN_ON_CL_ERROR(error, "clCreateImage")
-
-        cl_mem coords_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
-            sizeof(coord_type) * count, static_cast<void *>(coords.data()), &error);
-        RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-        cl_mem input_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
-            sizeof(element_type) * count, static_cast<void *>(input.data()), &error);
-        RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-        error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &img);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        error = clSetKernelArg(kernel, 1, sizeof(coords_buffer), &coords_buffer);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        error = clSetKernelArg(kernel, 2, sizeof(input_buffer), &input_buffer);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-        const size_t global_size = count;
-        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, NULL);
-        RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-        std::vector<channel_type> image_values(image_desc.image_width * image_desc.image_height * image_desc.image_depth * channel_count);
-
-        const size_t origin[3] = { 0 };
-        const size_t region[3] = { image_desc.image_width, image_desc.image_height, image_desc.image_depth };
-        error = clEnqueueReadImage(
-            queue, img, CL_TRUE,
-            origin, region, 0, 0,
-            static_cast<void *>(image_values.data()),
-            0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-        for (size_t i = 0; i < count; i++)
-        {
-            const coord_type c = coords[i];
-            const element_type expected = input[i];
-
-            element_type result;
-            read_image_pixel<scalar_element_type>(static_cast<void *>(image_values.data()), &image_info,
-                c.s[0], c.s[1], c.s[2],
-                result.s);
-
-            if (!are_equal(result, expected))
-            {
-                RETURN_ON_ERROR_MSG(-1,
-                    "Writing to coordinates %s failed. Expected: %s, got: %s",
-                    format_value(c).c_str(), format_value(expected).c_str(), format_value(result).c_str()
-                );
-            }
-        }
-
-        clReleaseMemObject(img);
-        clReleaseMemObject(coords_buffer);
-        clReleaseMemObject(input_buffer);
-        clReleaseKernel(kernel);
-        clReleaseProgram(program);
-        return error;
-    }
-};
-
-template<cl_mem_object_type ImageType>
-int run_test_cases(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    if (!is_test_supported(device))
-        return CL_SUCCESS;
-
-    int error = CL_SUCCESS;
-
-    for (auto channel_order : get_channel_orders(device))
-    {
-        error = image_test<ImageType, CL_SIGNED_INT8>(channel_order)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-        error = image_test<ImageType, CL_SIGNED_INT16>(channel_order)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-        error = image_test<ImageType, CL_SIGNED_INT32>(channel_order)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-
-        error = image_test<ImageType, CL_UNSIGNED_INT8>(channel_order)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-        error = image_test<ImageType, CL_UNSIGNED_INT16>(channel_order)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-        error = image_test<ImageType, CL_UNSIGNED_INT32>(channel_order)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-
-        error = image_test<ImageType, CL_FLOAT>(channel_order)
-            .run(device, context, queue, num_elements);
-        RETURN_ON_ERROR(error)
-    }
-
-    return error;
-}
-
-
-AUTO_TEST_CASE(test_images_write_1d)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    return run_test_cases<CL_MEM_OBJECT_IMAGE1D>(device, context, queue, num_elements);
-}
-
-AUTO_TEST_CASE(test_images_write_2d)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    return run_test_cases<CL_MEM_OBJECT_IMAGE2D>(device, context, queue, num_elements);
-}
-
-AUTO_TEST_CASE(test_images_write_3d)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    return run_test_cases<CL_MEM_OBJECT_IMAGE3D>(device, context, queue, num_elements);
-}
-
-} // namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_IMAGES_TEST_WRITE_HPP
diff --git a/test_conformance/clcpp/integer_funcs/24bit_funcs.hpp b/test_conformance/clcpp/integer_funcs/24bit_funcs.hpp
deleted file mode 100644
index 98da450b43..0000000000
--- a/test_conformance/clcpp/integer_funcs/24bit_funcs.hpp
+++ /dev/null
@@ -1,142 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_INTEGER_FUNCS_24BIT_HPP
-#define TEST_CONFORMANCE_CLCPP_INTEGER_FUNCS_24BIT_HPP
-
-#include "common.hpp"
-#include <type_traits>
-
-template<class IN1, class IN2, class IN3, class OUT1>
-struct int_func_mad24 : public ternary_func<IN1, IN2, IN3, OUT1>
-{
-    std::string str()
-    {
-        return "mad24";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y, const IN3& z)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value
-                && std::is_same<IN2, IN3>::value
-                && std::is_same<IN3, OUT1>::value,
-            "All types must be the same"
-        );
-        static_assert(
-            std::is_same<cl_uint, IN1>::value || std::is_same<cl_int, IN1>::value,
-            "Function takes only signed/unsigned integers."
-        );
-        return (x * y) + z;
-    }
-
-    IN1 min1()
-    {
-        return 0;
-    }
-
-    IN1 max1()
-    {
-        return (std::numeric_limits<IN1>::max)() & IN1(0x00FFFF);
-    }
-
-    IN2 min2()
-    {
-        return 0;
-    }
-
-    IN2 max2()
-    {
-        return (std::numeric_limits<IN2>::max)() & IN2(0x00FFFF);
-    }
-};
-
-template<class IN1, class IN2, class OUT1>
-struct int_func_mul24 : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "mul24";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value
-                && std::is_same<IN2, OUT1>::value,
-            "All types must be the same"
-        );
-        static_assert(
-            std::is_same<cl_uint, IN1>::value || std::is_same<cl_int, IN1>::value,
-            "Function takes only signed/unsigned integers."
-        );
-        return x * y;
-    }
-
-    IN1 min1()
-    {
-        return 0;
-    }
-
-    IN1 max1()
-    {
-        return (std::numeric_limits<IN1>::max)() & IN1(0x00FFFF);
-    }
-
-    IN2 min2()
-    {
-        return 0;
-    }
-
-    IN2 max2()
-    {
-        return (std::numeric_limits<IN2>::max)() & IN2(0x00FFFF);
-    }
-};
-
-AUTO_TEST_CASE(test_int_24bit_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-    
-    // intn mad24(intn x, intn y, intn z);
-    // uintn mad24(uintn x, uintn y, uintn z);
-    TEST_TERNARY_FUNC_MACRO((int_func_mad24<cl_int, cl_int, cl_int, cl_int>()))
-    TEST_TERNARY_FUNC_MACRO((int_func_mad24<cl_uint, cl_uint, cl_uint, cl_uint>()))
-
-    // intn mul24(intn x, intn y);
-    // uintn mul24(uintn x, uintn y);
-    TEST_BINARY_FUNC_MACRO((int_func_mul24<cl_int, cl_int, cl_int>()))
-    TEST_BINARY_FUNC_MACRO((int_func_mul24<cl_uint, cl_uint, cl_uint>()))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_INTEGER_FUNCS_24BIT_HPP
diff --git a/test_conformance/clcpp/integer_funcs/CMakeLists.txt b/test_conformance/clcpp/integer_funcs/CMakeLists.txt
deleted file mode 100644
index ba4cfe865d..0000000000
--- a/test_conformance/clcpp/integer_funcs/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_INTEGER_FUNCS)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/integer_funcs/bitwise_funcs.hpp b/test_conformance/clcpp/integer_funcs/bitwise_funcs.hpp
deleted file mode 100644
index 13ca1563d0..0000000000
--- a/test_conformance/clcpp/integer_funcs/bitwise_funcs.hpp
+++ /dev/null
@@ -1,232 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_INTEGER_FUNCS_BITWISE_HPP
-#define TEST_CONFORMANCE_CLCPP_INTEGER_FUNCS_BITWISE_HPP
-
-#include "common.hpp"
-#include <type_traits>
-
-template<class IN1, class OUT1>
-struct int_func_popcount : public unary_func<IN1, OUT1>
-{
-    std::string str()
-    {
-        return "popcount";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(IN1 x)
-    {
-        OUT1 count = 0;
-        for (count = 0; x != 0; count++)
-        {
-            x &= x - 1;
-        }
-        return count;
-    }
-};
-
-template<class IN1, class OUT1>
-struct int_func_clz : public unary_func<IN1, OUT1>
-{
-    std::string str()
-    {
-        return "clz";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(IN1 x)
-    {
-        OUT1 count = 0;
-        if(std::is_unsigned<IN1>::value)
-        {
-            cl_ulong value = x;
-            value <<= 8 * sizeof(value) - (8 * sizeof(x));
-            for(count = 0; 0 == (value & (CL_LONG_MIN)); count++)
-            {
-                value <<= 1;
-            }
-        }
-        else
-        {            
-            cl_long value = x;
-            value <<= 8 * sizeof(value) - (8 * sizeof(x));
-            for(count = 0; 0 == (value & (CL_LONG_MIN)); count++)
-            {
-                value <<= 1;
-            }
-        }
-        return count;
-    }
-};
-
-template<class IN1, class OUT1>
-struct int_func_ctz : public unary_func<IN1, OUT1>
-{
-    std::string str()
-    {
-        return "ctz";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(IN1 x)
-    {
-        if(x == 0)
-            return sizeof(x);
-
-        OUT1 count = 0;
-        IN1 value = x;
-        for(count = 0; 0 == (value & 0x1); count++)
-        {
-            value >>= 1;
-        }
-        return count;
-    }
-};
-
-template<class IN1, class IN2, class OUT1>
-struct int_func_rotate : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "rotate";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(IN1 value, IN2 shift)
-    {
-        static_assert(
-            std::is_unsigned<IN1>::value,
-            "Only unsigned integers are supported"
-        );
-        if ((shift &= sizeof(value)*8 - 1) == 0)
-            return value;
-        return (value << shift) | (value >> (sizeof(value)*8 - shift));
-    }
-
-    IN2 min2()
-    {
-        return 0;
-    }
-
-    IN2 max2()
-    {
-        return sizeof(IN1) * 8;
-    }
-};
-
-template<class IN1, class IN2, class OUT1>
-struct int_func_upsample : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "upsample";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(IN1 hi, IN2 lo)
-    {
-        static_assert(
-            sizeof(IN1) == sizeof(IN2),
-            "sizeof(IN1) != sizeof(IN2)"
-        );
-        static_assert(
-            sizeof(OUT1) == 2 * sizeof(IN1),
-            "sizeof(OUT1) != 2 * sizeof(IN1)"
-        );
-        static_assert(
-            std::is_unsigned<IN2>::value,
-            "IN2 type must be unsigned"
-        );
-        return (static_cast<OUT1>(hi) << (8*sizeof(IN1))) | lo;
-    }
-
-    IN2 min2()
-    {
-        return 0;
-    }
-
-    IN2 max2()
-    {
-        return sizeof(IN1) * 8;
-    }
-};
-
-AUTO_TEST_CASE(test_int_bitwise_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-    
-    TEST_UNARY_FUNC_MACRO((int_func_popcount<cl_int, cl_int>()))
-    TEST_UNARY_FUNC_MACRO((int_func_popcount<cl_uint, cl_uint>()))
-    TEST_UNARY_FUNC_MACRO((int_func_popcount<cl_long, cl_long>()))
-    TEST_UNARY_FUNC_MACRO((int_func_popcount<cl_ulong, cl_ulong>()))
-
-    TEST_UNARY_FUNC_MACRO((int_func_clz<cl_int, cl_int>()))
-    TEST_UNARY_FUNC_MACRO((int_func_clz<cl_uint, cl_uint>()))
-    TEST_UNARY_FUNC_MACRO((int_func_clz<cl_long, cl_long>()))
-    TEST_UNARY_FUNC_MACRO((int_func_clz<cl_ulong, cl_ulong>()))
-
-    TEST_UNARY_FUNC_MACRO((int_func_ctz<cl_int, cl_int>()))
-    TEST_UNARY_FUNC_MACRO((int_func_ctz<cl_uint, cl_uint>()))
-    TEST_UNARY_FUNC_MACRO((int_func_ctz<cl_long, cl_long>()))
-    TEST_UNARY_FUNC_MACRO((int_func_ctz<cl_ulong, cl_ulong>()))
-
-    TEST_BINARY_FUNC_MACRO((int_func_rotate<cl_uint, cl_uint, cl_uint>()))
-    TEST_BINARY_FUNC_MACRO((int_func_rotate<cl_ulong, cl_ulong, cl_ulong>()))
-
-    // shortn upsample(charn hi, ucharn lo);
-    TEST_BINARY_FUNC_MACRO((int_func_upsample<cl_char, cl_uchar, cl_short>()))
-    // ushortn upsample(ucharn hi, ucharn lo);
-    TEST_BINARY_FUNC_MACRO((int_func_upsample<cl_uchar, cl_uchar, cl_ushort>()))
-    // intn upsample(shortn hi, ushortn lo);
-    TEST_BINARY_FUNC_MACRO((int_func_upsample<cl_short, cl_ushort, cl_int>()))
-    // uintn upsample(ushortn hi, ushortn lo);
-    TEST_BINARY_FUNC_MACRO((int_func_upsample<cl_ushort, cl_ushort, cl_uint>()))
-    // longn upsample(intn hi, uintn lo);
-    TEST_BINARY_FUNC_MACRO((int_func_upsample<cl_int, cl_uint, cl_long>()))
-    // ulongn upsample(uintn hi, uintn lo);
-    TEST_BINARY_FUNC_MACRO((int_func_upsample<cl_uint, cl_uint, cl_ulong>()))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_INTEGER_FUNCS_BITWISE_HPP
diff --git a/test_conformance/clcpp/integer_funcs/common.hpp b/test_conformance/clcpp/integer_funcs/common.hpp
deleted file mode 100644
index f04811e145..0000000000
--- a/test_conformance/clcpp/integer_funcs/common.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_INTEGER_FUNCS_COMMON_HPP
-#define TEST_CONFORMANCE_CLCPP_INTEGER_FUNCS_COMMON_HPP
-
-#include <random>
-#include <limits>
-#include <algorithm>
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#endif // TEST_CONFORMANCE_CLCPP_INTEGER_FUNCS_COMMON_HPP
diff --git a/test_conformance/clcpp/integer_funcs/main.cpp b/test_conformance/clcpp/integer_funcs/main.cpp
deleted file mode 100644
index c6cdfb616f..0000000000
--- a/test_conformance/clcpp/integer_funcs/main.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "bitwise_funcs.hpp"
-#include "numeric_funcs.hpp"
-#include "24bit_funcs.hpp"
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/integer_funcs/numeric_funcs.hpp b/test_conformance/clcpp/integer_funcs/numeric_funcs.hpp
deleted file mode 100644
index 21d75c5acd..0000000000
--- a/test_conformance/clcpp/integer_funcs/numeric_funcs.hpp
+++ /dev/null
@@ -1,703 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_INTEGER_FUNCS_NUMERIC_HPP
-#define TEST_CONFORMANCE_CLCPP_INTEGER_FUNCS_NUMERIC_HPP
-
-#include "common.hpp"
-#include <type_traits>
-
-template<class IN1, class OUT1>
-struct int_func_abs : public unary_func<IN1, OUT1>
-{
-    std::string str()
-    {
-        return "abs";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x)
-    {
-        static_assert(
-            std::is_unsigned<OUT1>::value,
-            "OUT1 type must be unsigned"
-        );
-        if(x < IN1(0))
-            return static_cast<OUT1>(-x);
-        return static_cast<OUT1>(x);
-    }
-};
-
-template<class IN1, class IN2, class OUT1>
-struct int_func_abs_diff : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "abs_diff";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value,
-            "IN1 must be IN2"
-        );
-        static_assert(
-            std::is_unsigned<OUT1>::value,
-            "OUT1 type must be unsigned"
-        );
-        if(x < y)
-            return static_cast<OUT1>(y-x);
-        return static_cast<OUT1>(x-y);
-    }
-};
-
-template<class IN1, class IN2, class OUT1>
-struct int_func_add_sat : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "add_sat";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value,
-            "IN1 must be IN2"
-        );
-        static_assert(
-            std::is_same<OUT1, IN2>::value,
-            "OUT1 must be IN2"
-        );
-        // sat unsigned integers
-        if(std::is_unsigned<OUT1>::value)
-        {
-            OUT1 z = x + y;
-            if(z < x || z < y)
-                return (std::numeric_limits<OUT1>::max)();
-            return z;
-        }
-        // sat signed integers
-        OUT1 z = x + y;
-        if(y > 0)
-        {
-            if(z < x)
-                return (std::numeric_limits<OUT1>::max)();
-        }
-        else
-        {
-            if(z > x)
-                return (std::numeric_limits<OUT1>::min)();
-        }
-        return z;
-    }
-};
-
-template<class IN1, class IN2, class OUT1>
-struct int_func_hadd : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "hadd";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value,
-            "IN1 must be IN2"
-        );
-        static_assert(
-            std::is_same<OUT1, IN2>::value,
-            "OUT1 must be IN2"
-        );
-        return (x >> OUT1(1)) + (y >> OUT1(1)) + (x & y & OUT1(1));
-    }
-};
-
-template<class IN1, class IN2, class OUT1>
-struct int_func_rhadd : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "rhadd";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value,
-            "IN1 must be IN2"
-        );
-        static_assert(
-            std::is_same<OUT1, IN2>::value,
-            "OUT1 must be IN2"
-        );
-        return (x >> OUT1(1)) + (y >> OUT1(1)) + ((x | y) & OUT1(1));
-    }
-};
-
-// clamp for scalars
-template<class IN1, class IN2, class IN3, class OUT1, class Enable = void>
-struct int_func_clamp : public ternary_func<IN1, IN2, IN3, OUT1>
-{
-    std::string str()
-    {
-        return "clamp";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& minval, const IN3& maxval)
-    {
-        static_assert(
-            std::is_same<IN2, IN3>::value,
-            "IN3 must be IN2"
-        );
-        static_assert(
-            std::is_same<OUT1, IN1>::value,
-            "OUT1 must be IN1"
-        );
-        return (std::min)((std::max)(x, minval), maxval);
-    }
-
-    IN2 min2()
-    {
-        return (std::numeric_limits<IN2>::min)();
-    }
-
-    IN2 max2()
-    {
-        return (std::numeric_limits<IN2>::max)() / IN2(2);
-    }
-
-    IN3 min3()
-    {
-        return IN3(1) + ((std::numeric_limits<IN3>::max)() / IN3(2));
-    }
-
-    IN3 max3()
-    {
-        return (std::numeric_limits<IN3>::max)();
-    }
-};
-
-// gentype clamp(gentype x, scalar minval, scalar maxval);
-template<class IN1, class IN2, class IN3, class OUT1>
-struct int_func_clamp<IN1, IN2, IN3, OUT1, typename std::enable_if<is_vector_type<OUT1>::value>::type> : public ternary_func<IN1, IN2, IN3, OUT1>
-{
-    std::string str()
-    {
-        return "clamp";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& minval, const IN3& maxval)
-    {
-        static_assert(
-            std::is_same<IN2, IN3>::value,
-            "IN3 must be IN2"
-        );
-        static_assert(
-            !is_vector_type<IN2>::value && !is_vector_type<IN3>::value,
-            "IN3 and IN2 must be scalar"
-        );
-        static_assert(
-            std::is_same<OUT1, IN1>::value,
-            "OUT1 must be IN1"
-        );
-        OUT1 result;
-        for(size_t i = 0; i < vector_size<OUT1>::value; i++)
-        {
-            result.s[i] = (std::min)((std::max)(x.s[i], minval), maxval);
-        }
-        return result;
-    }
-
-    IN1 min1()
-    {
-        typedef typename scalar_type<IN1>::type SCALAR1;
-        IN1 min1;
-        for(size_t i = 0; i < vector_size<IN1>::value; i++)
-        {
-            min1.s[i] = (std::numeric_limits<SCALAR1>::min)();
-        }
-        return min1;
-    }
-
-    IN1 max1()
-    {
-        typedef typename scalar_type<IN1>::type SCALAR1;
-        IN1 max1;
-        for(size_t i = 0; i < vector_size<IN1>::value; i++)
-        {
-            max1.s[i] = (std::numeric_limits<SCALAR1>::max)();
-        }
-        return max1;
-    }
-
-    IN2 min2()
-    {
-        return (std::numeric_limits<IN2>::min)();
-    }
-
-    IN2 max2()
-    {
-        return (std::numeric_limits<IN2>::max)() / IN2(2);
-    }
-
-    IN3 min3()
-    {
-        return IN3(1) + ((std::numeric_limits<IN3>::max)() / IN3(2));
-    }
-
-    IN3 max3()
-    {
-        return (std::numeric_limits<IN3>::max)();
-    }
-};
-
-template<class IN1, class IN2, class OUT1>
-struct int_func_mul_hi : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "mul_hi";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value
-                && std::is_same<IN2, OUT1>::value,
-            "Types must be the same"
-        );
-        static_assert(
-            !std::is_same<IN1, cl_long>::value && !std::is_same<IN1, cl_ulong>::value,
-            "Operation unimplemented for 64-bit scalars"
-        );  
-        cl_long xl = static_cast<cl_long>(x);
-        cl_long yl = static_cast<cl_long>(y);
-        return static_cast<OUT1>((xl * yl) >> (8 * sizeof(OUT1)));
-    }
-};
-
-template<class IN1, class IN2, class IN3, class OUT1>
-struct int_func_mad_hi : public ternary_func<IN1, IN2, IN3, OUT1>
-{
-    std::string str()
-    {
-        return "mad_hi";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y, const IN3& z)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value
-                && std::is_same<IN2, IN3>::value
-                && std::is_same<IN3, OUT1>::value,
-            "Types must be the same"
-        );   
-        return int_func_mul_hi<IN1, IN2, OUT1>()(x, y) + z;
-    }
-};
-
-// This test is implemented only for unsigned integers
-template<class IN1, class IN2, class IN3, class OUT1>
-struct int_func_mad_sat : public ternary_func<IN1, IN2, IN3, OUT1>
-{
-    std::string str()
-    {
-        return "mad_sat";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y, const IN3& z)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value
-                && std::is_same<IN2, IN3>::value
-                && std::is_same<IN3, OUT1>::value,
-            "Types must be the same"
-        );
-        static_assert(
-            std::is_unsigned<OUT1>::value,
-            "Test operation is not implemented for signed integers"
-        );  
-        // mad_sat unsigned integers
-        OUT1 w1 = (x * y);
-        if (x != 0 && w1 / x != y)
-            return (std::numeric_limits<OUT1>::max)();
-        OUT1 w2 = w1 + z;
-        if(w2 < w1)
-            return (std::numeric_limits<OUT1>::max)();
-        return w2;
-    }
-};
-
-template<class IN1, class IN2, class OUT1>
-struct int_func_sub_sat : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "sub_sat";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value && std::is_same<IN2, OUT1>::value,
-            "IN1, IN2 and OUT1 must be the same types"
-        );
-        // sat unsigned integers
-        if(std::is_unsigned<OUT1>::value)
-        {
-            OUT1 z = x - y;
-            if(x < y)
-                return (std::numeric_limits<OUT1>::min)();
-            return z;
-        }
-        // sat signed integers
-        OUT1 z = x - y;
-        if(y < 0)
-        {
-            if(z < x)
-                return (std::numeric_limits<OUT1>::max)();
-        }
-        else
-        {
-            if(z > x)
-                return (std::numeric_limits<OUT1>::min)();
-        }
-        return z;
-    }
-};
-
-template<class IN1, class IN2, class OUT1, class Enable = void>
-struct int_func_max : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "max";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value && std::is_same<IN2, OUT1>::value,
-            "IN1, IN2 and OUT1 must be the same types"
-        );
-        return (std::max)(x, y);
-    }
-};
-
-template<class IN1, class IN2, class OUT1>
-struct int_func_max<IN1, IN2, OUT1, typename std::enable_if<is_vector_type<OUT1>::value>::type> : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "max";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    IN1 min1()
-    {
-        typedef typename scalar_type<IN1>::type SCALAR1;
-        IN1 min1;
-        for(size_t i = 0; i < vector_size<IN1>::value; i++)
-        {
-            min1.s[i] = (std::numeric_limits<SCALAR1>::min)();
-        }
-        return min1;
-    }
-
-    IN1 max1()
-    {
-        typedef typename scalar_type<IN1>::type SCALAR1;
-        IN1 max1;
-        for(size_t i = 0; i < vector_size<IN1>::value; i++)
-        {
-            max1.s[i] = (std::numeric_limits<SCALAR1>::max)();
-        }
-        return max1;
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y)
-    {
-        static_assert(
-            std::is_same<IN1, OUT1>::value,
-            "IN1 and OUT1 must be the same types"
-        );
-        static_assert(
-            !is_vector_type<IN2>::value,
-            "IN2 must be scalar"
-        );
-        static_assert(
-            std::is_same<typename scalar_type<OUT1>::type, IN2>::value,
-            "IN2 must match with OUT1 and IN1"
-        );
-        IN1 result = x;
-        for(size_t i = 0; i < vector_size<IN1>::value; i++)
-        {
-            result.s[i] = (std::max)(x.s[i], y);
-        }
-        return result;
-    }
-};
-
-template<class IN1, class IN2, class OUT1, class Enable = void>
-struct int_func_min : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "min";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y)
-    {
-        static_assert(
-            std::is_same<IN1, IN2>::value && std::is_same<IN2, OUT1>::value,
-            "IN1, IN2 and OUT1 must be the same types"
-        );
-        return (std::min)(x, y);
-    }
-};
-
-template<class IN1, class IN2, class OUT1>
-struct int_func_min<IN1, IN2, OUT1, typename std::enable_if<is_vector_type<OUT1>::value>::type> : public binary_func<IN1, IN2, OUT1>
-{
-    std::string str()
-    {
-        return "min";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_integer>\n";
-    }
-
-    IN1 min1()
-    {
-        typedef typename scalar_type<IN1>::type SCALAR1;
-        IN1 min1;
-        for(size_t i = 0; i < vector_size<IN1>::value; i++)
-        {
-            min1.s[i] = (std::numeric_limits<SCALAR1>::min)();
-        }
-        return min1;
-    }
-
-    IN1 max1()
-    {
-        typedef typename scalar_type<IN1>::type SCALAR1;
-        IN1 max1;
-        for(size_t i = 0; i < vector_size<IN1>::value; i++)
-        {
-            max1.s[i] = (std::numeric_limits<SCALAR1>::max)();
-        }
-        return max1;
-    }
-
-    OUT1 operator()(const IN1& x, const IN2& y)
-    {
-        static_assert(
-            std::is_same<IN1, OUT1>::value,
-            "IN1 and OUT1 must be the same types"
-        );
-        static_assert(
-            !is_vector_type<IN2>::value,
-            "IN2 must be scalar"
-        );
-        static_assert(
-            std::is_same<typename scalar_type<OUT1>::type, IN2>::value,
-            "IN2 must match with OUT1 and IN1"
-        );
-        IN1 result = x;
-        for(size_t i = 0; i < vector_size<IN1>::value; i++)
-        {
-            result.s[i] = (std::min)(x.s[i], y);
-        }
-        return result;
-    }
-};
-
-AUTO_TEST_CASE(test_int_numeric_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // ugentype abs(gentype x);
-    TEST_UNARY_FUNC_MACRO((int_func_abs<cl_int, cl_uint>()))
-    TEST_UNARY_FUNC_MACRO((int_func_abs<cl_uint, cl_uint>()))
-    TEST_UNARY_FUNC_MACRO((int_func_abs<cl_long, cl_ulong>()))
-    TEST_UNARY_FUNC_MACRO((int_func_abs<cl_ulong, cl_ulong>()))
-
-    // ugentype abs_diff(gentype x, gentype y);
-    TEST_BINARY_FUNC_MACRO((int_func_abs_diff<cl_int, cl_int, cl_uint>()))
-    TEST_BINARY_FUNC_MACRO((int_func_abs_diff<cl_uint, cl_uint, cl_uint>()))
-    TEST_BINARY_FUNC_MACRO((int_func_abs_diff<cl_long, cl_long, cl_ulong>()))
-    TEST_BINARY_FUNC_MACRO((int_func_abs_diff<cl_ulong, cl_ulong, cl_ulong>()))
-
-    // gentype add_sat(gentype x, gentype y);
-    TEST_BINARY_FUNC_MACRO((int_func_add_sat<cl_int, cl_int, cl_int>()))
-    TEST_BINARY_FUNC_MACRO((int_func_add_sat<cl_uint, cl_uint, cl_uint>()))
-    TEST_BINARY_FUNC_MACRO((int_func_add_sat<cl_long, cl_long, cl_long>()))
-    TEST_BINARY_FUNC_MACRO((int_func_add_sat<cl_ulong, cl_ulong, cl_ulong>()))
-
-    // gentype hadd(gentype x, gentype y);
-    TEST_BINARY_FUNC_MACRO((int_func_hadd<cl_int, cl_int, cl_int>()))
-    TEST_BINARY_FUNC_MACRO((int_func_hadd<cl_uint, cl_uint, cl_uint>()))
-    TEST_BINARY_FUNC_MACRO((int_func_hadd<cl_long, cl_long, cl_long>()))
-    TEST_BINARY_FUNC_MACRO((int_func_hadd<cl_ulong, cl_ulong, cl_ulong>()))
-
-    // gentype rhadd(gentype x, gentype y);
-    TEST_BINARY_FUNC_MACRO((int_func_rhadd<cl_int, cl_int, cl_int>()))
-    TEST_BINARY_FUNC_MACRO((int_func_rhadd<cl_uint, cl_uint, cl_uint>()))
-    TEST_BINARY_FUNC_MACRO((int_func_rhadd<cl_long, cl_long, cl_long>()))
-    TEST_BINARY_FUNC_MACRO((int_func_rhadd<cl_ulong, cl_ulong, cl_ulong>()))
-
-    // gentype clamp(gentype x, gentype minval, gentype maxval);
-    TEST_TERNARY_FUNC_MACRO((int_func_clamp<cl_int, cl_int, cl_int, cl_int>()))
-    TEST_TERNARY_FUNC_MACRO((int_func_clamp<cl_uint, cl_uint, cl_uint, cl_uint>()))
-    TEST_TERNARY_FUNC_MACRO((int_func_clamp<cl_long, cl_long, cl_long, cl_long>()))
-    TEST_TERNARY_FUNC_MACRO((int_func_clamp<cl_ulong, cl_ulong, cl_ulong, cl_ulong>()))
-
-    // gentype clamp(gentype x, scalar minval, scalar maxval);
-    TEST_TERNARY_FUNC_MACRO((int_func_clamp<cl_int2, cl_int, cl_int, cl_int2>()))
-    TEST_TERNARY_FUNC_MACRO((int_func_clamp<cl_uint4, cl_uint, cl_uint, cl_uint4>()))
-    TEST_TERNARY_FUNC_MACRO((int_func_clamp<cl_long8, cl_long, cl_long, cl_long8>()))
-    TEST_TERNARY_FUNC_MACRO((int_func_clamp<cl_ulong16, cl_ulong, cl_ulong, cl_ulong16>()))
-
-    // gentype mad_hi(gentype a, gentype b, gentype c);
-    TEST_TERNARY_FUNC_MACRO((int_func_mad_hi<cl_short, cl_short, cl_short, cl_short>()))
-    TEST_TERNARY_FUNC_MACRO((int_func_mad_hi<cl_ushort, cl_ushort, cl_ushort, cl_ushort>()))
-    TEST_TERNARY_FUNC_MACRO((int_func_mad_hi<cl_int, cl_int, cl_int, cl_int>()))
-    TEST_TERNARY_FUNC_MACRO((int_func_mad_hi<cl_uint, cl_uint, cl_uint, cl_uint>()))
-
-    // gentype mad_sat(gentype a, gentype b, gentype c);
-    TEST_TERNARY_FUNC_MACRO((int_func_mad_sat<cl_ushort, cl_ushort, cl_ushort, cl_ushort>()))
-    TEST_TERNARY_FUNC_MACRO((int_func_mad_sat<cl_uint, cl_uint, cl_uint, cl_uint>()))
-    TEST_TERNARY_FUNC_MACRO((int_func_mad_sat<cl_ulong, cl_ulong, cl_ulong, cl_ulong>()))
-
-    // gentype max(gentype x, gentype y);
-    TEST_BINARY_FUNC_MACRO((int_func_max<cl_int, cl_int, cl_int>()))
-    TEST_BINARY_FUNC_MACRO((int_func_max<cl_uint, cl_uint, cl_uint>()))
-    TEST_BINARY_FUNC_MACRO((int_func_max<cl_long, cl_long, cl_long>()))
-    TEST_BINARY_FUNC_MACRO((int_func_max<cl_ulong, cl_ulong, cl_ulong>()))
-
-    // gentype max(gentype x, scalar y);
-    TEST_BINARY_FUNC_MACRO((int_func_max<cl_int2, cl_int, cl_int2>()))
-    TEST_BINARY_FUNC_MACRO((int_func_max<cl_uint4, cl_uint, cl_uint4>()))
-    TEST_BINARY_FUNC_MACRO((int_func_max<cl_long8, cl_long, cl_long8>()))
-    TEST_BINARY_FUNC_MACRO((int_func_max<cl_ulong16, cl_ulong, cl_ulong16>()))
-
-    // gentype min(gentype x, gentype y);
-    TEST_BINARY_FUNC_MACRO((int_func_min<cl_int, cl_int, cl_int>()))
-    TEST_BINARY_FUNC_MACRO((int_func_min<cl_uint, cl_uint, cl_uint>()))
-    TEST_BINARY_FUNC_MACRO((int_func_min<cl_long, cl_long, cl_long>()))
-    TEST_BINARY_FUNC_MACRO((int_func_min<cl_ulong, cl_ulong, cl_ulong>()))
-
-    // gentype min(gentype x, scalar y);
-    TEST_BINARY_FUNC_MACRO((int_func_min<cl_int2, cl_int, cl_int2>()))
-    TEST_BINARY_FUNC_MACRO((int_func_min<cl_uint4, cl_uint, cl_uint4>()))
-    TEST_BINARY_FUNC_MACRO((int_func_min<cl_long8, cl_long, cl_long8>()))
-    TEST_BINARY_FUNC_MACRO((int_func_min<cl_ulong16, cl_ulong, cl_ulong16>()))
-
-    // gentype mul_hi(gentype x, gentype y);
-    TEST_BINARY_FUNC_MACRO((int_func_mul_hi<cl_short, cl_short, cl_short>()))
-    TEST_BINARY_FUNC_MACRO((int_func_mul_hi<cl_ushort, cl_ushort, cl_ushort>())) 
-    TEST_BINARY_FUNC_MACRO((int_func_mul_hi<cl_int, cl_int, cl_int>()))
-    TEST_BINARY_FUNC_MACRO((int_func_mul_hi<cl_uint, cl_uint, cl_uint>()))
-
-    // gentype sub_sat(gentype x, gentype y);
-    TEST_BINARY_FUNC_MACRO((int_func_sub_sat<cl_int, cl_int, cl_int>()))
-    TEST_BINARY_FUNC_MACRO((int_func_sub_sat<cl_uint, cl_uint, cl_uint>()))
-    TEST_BINARY_FUNC_MACRO((int_func_sub_sat<cl_long, cl_long, cl_long>()))
-    TEST_BINARY_FUNC_MACRO((int_func_sub_sat<cl_ulong, cl_ulong, cl_ulong>()))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_INTEGER_FUNCS_NUMERIC_HPP
diff --git a/test_conformance/clcpp/math_funcs/CMakeLists.txt b/test_conformance/clcpp/math_funcs/CMakeLists.txt
deleted file mode 100644
index c3b56c1717..0000000000
--- a/test_conformance/clcpp/math_funcs/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_MATH_FUNCS)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/math_funcs/common.hpp b/test_conformance/clcpp/math_funcs/common.hpp
deleted file mode 100644
index 32249056d0..0000000000
--- a/test_conformance/clcpp/math_funcs/common.hpp
+++ /dev/null
@@ -1,347 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_MATH_FUNCS_COMMON_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_MATH_FUNCS_COMMON_FUNCS_HPP
-
-#include <cmath>
-#include <limits>
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#include "reference.hpp"
-
-#ifndef MATH_FUNCS_CLASS_NAME
-    #define MATH_FUNCS_CLASS_NAME(x, y) x ## _func_ ## y        
-#endif 
-
-#define MATH_FUNCS_DEFINE_UNARY_FUNC1(GROUP_NAME, NAME, OCL_FUNC, HOST_FUNC, USE_ULP, ULP, ULP_EMBEDDED, DELTA, MIN1, MAX1) \
-struct MATH_FUNCS_CLASS_NAME(GROUP_NAME, NAME) : public unary_func<cl_float, cl_float> \
-{ \
-    MATH_FUNCS_CLASS_NAME(GROUP_NAME, NAME)(bool is_embedded) : m_is_embedded(is_embedded)  \
-    { \
-    \
-    } \
-    \
-    std::string str() \
-    { \
-        return #OCL_FUNC; \
-    } \
-    \
-    std::string headers()  \
-    { \
-        return "#include <opencl_math>\n"; \
-    } \
-    /* Reference value type is cl_double */ \
-    cl_double operator()(const cl_float& x)  \
-    { \
-        return (HOST_FUNC)(static_cast<cl_double>(x)); \
-    } \
-    \
-    cl_float min1() \
-    { \
-        return MIN1; \
-    } \
-    \
-    cl_float max1() \
-    { \
-        return MAX1; \
-    } \
-    \
-    std::vector<cl_float> in1_special_cases() \
-    { \
-        return {  \
-            cl_float(0.0f), \
-            cl_float(-0.0f), \
-            cl_float(1.0f), \
-            cl_float(-1.0f), \
-            cl_float(2.0f), \
-            cl_float(-2.0f), \
-            std::numeric_limits<cl_float>::infinity(), \
-            -std::numeric_limits<cl_float>::infinity(), \
-            std::numeric_limits<cl_float>::quiet_NaN() \
-        }; \
-    } \
-    \
-    bool use_ulp() \
-    { \
-        return USE_ULP; \
-    } \
-    \
-    template<class T> \
-    typename make_vector_type<cl_double, vector_size<T>::value>::type \
-    delta(const cl_float& in1, const T& expected) \
-    { \
-        typedef  \
-            typename make_vector_type<cl_double, vector_size<T>::value>::type \
-            delta_vector_type; \
-        (void) in1; \
-        auto e = detail::make_value<delta_vector_type>(DELTA); \
-        return detail::multiply<delta_vector_type>(e, expected); \
-    } \
-    \
-    float ulp() \
-    { \
-        if(m_is_embedded) \
-        { \
-            return ULP_EMBEDDED; \
-        } \
-        return ULP; \
-    } \
-private: \
-    bool m_is_embedded; \
-};
-
-#define MATH_FUNCS_DEFINE_BINARY_FUNC1(GROUP_NAME, NAME, OCL_NAME, HOST_FUNC, USE_ULP, ULP, ULP_EMBEDDED, DELTA, MIN1, MAX1, MIN2, MAX2) \
-struct MATH_FUNCS_CLASS_NAME(GROUP_NAME, NAME) : public binary_func<cl_float, cl_float, cl_float> \
-{ \
-    MATH_FUNCS_CLASS_NAME(GROUP_NAME, NAME)(bool is_embedded) : m_is_embedded(is_embedded)  \
-    { \
-    \
-    } \
-    \
-    std::string str() \
-    { \
-        return #OCL_NAME; \
-    } \
-    \
-    std::string headers()  \
-    { \
-        return "#include <opencl_math>\n"; \
-    } \
-    \
-    cl_float operator()(const cl_float& x, const cl_float& y)  \
-    { \
-        return (HOST_FUNC)(x, y); \
-    } \
-    \
-    cl_float min1() \
-    { \
-        return MIN1; \
-    } \
-    \
-    cl_float max1() \
-    { \
-        return MAX1; \
-    } \
-    \
-    cl_float min2() \
-    { \
-        return MIN2; \
-    } \
-    \
-    cl_float max2() \
-    { \
-        return MAX2; \
-    } \
-    \
-    std::vector<cl_float> in1_special_cases() \
-    { \
-        return {  \
-            cl_float(0.0f), \
-            cl_float(-0.0f), \
-            cl_float(1.0f), \
-            cl_float(-1.0f), \
-            cl_float(2.0f), \
-            cl_float(-2.0f), \
-            std::numeric_limits<cl_float>::infinity(), \
-            -std::numeric_limits<cl_float>::infinity(), \
-            std::numeric_limits<cl_float>::quiet_NaN() \
-        }; \
-    } \
-    \
-    std::vector<cl_float> in2_special_cases() \
-    { \
-        return {  \
-            cl_float(0.0f), \
-            cl_float(-0.0f), \
-            cl_float(1.0f), \
-            cl_float(-1.0f), \
-            cl_float(2.0f), \
-            cl_float(-2.0f), \
-            std::numeric_limits<cl_float>::infinity(), \
-            -std::numeric_limits<cl_float>::infinity(), \
-            std::numeric_limits<cl_float>::quiet_NaN() \
-        }; \
-    } \
-    \
-    template<class T> \
-    typename make_vector_type<cl_double, vector_size<T>::value>::type \
-    delta(const cl_float& in1, const cl_float& in2, const T& expected) \
-    { \
-        typedef \
-            typename make_vector_type<cl_double, vector_size<T>::value>::type \
-            delta_vector_type; \
-        (void) in1; \
-        (void) in2; \
-        auto e = detail::make_value<delta_vector_type>(DELTA); \
-        return detail::multiply<delta_vector_type>(e, expected); \
-    } \
-    \
-    bool use_ulp() \
-    { \
-        return USE_ULP; \
-    } \
-    \
-    float ulp() \
-    { \
-        if(m_is_embedded) \
-        { \
-            return ULP_EMBEDDED; \
-        } \
-        return ULP; \
-    } \
-private: \
-    bool m_is_embedded; \
-};
-
-#define MATH_FUNCS_DEFINE_TERNARY_FUNC1(GROUP_NAME, NAME, OCL_NAME, HOST_FUNC, USE_ULP, ULP, ULP_EMBEDDED, DELTA, MIN1, MAX1, MIN2, MAX2, MIN3, MAX3) \
-struct MATH_FUNCS_CLASS_NAME(GROUP_NAME, NAME) : public ternary_func<cl_float, cl_float, cl_float, cl_float> \
-{ \
-    MATH_FUNCS_CLASS_NAME(GROUP_NAME, NAME)(bool is_embedded) : m_is_embedded(is_embedded)  \
-    { \
-    \
-    } \
-    \
-    std::string str() \
-    { \
-        return #OCL_NAME; \
-    } \
-    \
-    std::string headers() \
-    { \
-        return "#include <opencl_math>\n"; \
-    } \
-    \
-    cl_double operator()(const cl_float& x, const cl_float& y, const cl_float& z)  \
-    { \
-        return (HOST_FUNC)(static_cast<cl_double>(x), static_cast<cl_double>(y), static_cast<cl_double>(z)); \
-    } \
-    \
-    cl_float min1() \
-    { \
-        return MIN1; \
-    } \
-    \
-    cl_float max1() \
-    { \
-        return MAX1; \
-    } \
-    \
-    cl_float min2() \
-    { \
-        return MIN2; \
-    } \
-    \
-    cl_float max2() \
-    { \
-        return MAX2; \
-    } \
-    \
-    cl_float min3() \
-    { \
-        return MIN3; \
-    } \
-    \
-    cl_float max3() \
-    { \
-        return MAX3; \
-    } \
-    \
-    std::vector<cl_float> in1_special_cases() \
-    { \
-        return {  \
-            cl_float(0.0f), \
-            cl_float(-0.0f), \
-            cl_float(1.0f), \
-            cl_float(-1.0f), \
-            cl_float(2.0f), \
-            cl_float(-2.0f), \
-            std::numeric_limits<cl_float>::infinity(), \
-            -std::numeric_limits<cl_float>::infinity(), \
-            std::numeric_limits<cl_float>::quiet_NaN() \
-        }; \
-    } \
-    \
-    std::vector<cl_float> in2_special_cases() \
-    { \
-        return {  \
-            cl_float(0.0f), \
-            cl_float(-0.0f), \
-            cl_float(1.0f), \
-            cl_float(-1.0f), \
-            cl_float(2.0f), \
-            cl_float(-2.0f), \
-            std::numeric_limits<cl_float>::infinity(), \
-            -std::numeric_limits<cl_float>::infinity(), \
-            std::numeric_limits<cl_float>::quiet_NaN() \
-        }; \
-    } \
-    \
-    std::vector<cl_float> in3_special_cases() \
-    { \
-        return {  \
-            cl_float(0.0f), \
-            cl_float(-0.0f), \
-            cl_float(1.0f), \
-            cl_float(-1.0f), \
-            cl_float(2.0f), \
-            cl_float(-2.0f), \
-            std::numeric_limits<cl_float>::infinity(), \
-            -std::numeric_limits<cl_float>::infinity(), \
-            std::numeric_limits<cl_float>::quiet_NaN() \
-        }; \
-    } \
-    \
-    template<class T> \
-    typename make_vector_type<cl_double, vector_size<T>::value>::type \
-    delta(const cl_float& in1, const cl_float& in2, const cl_float& in3, const T& expected) \
-    { \
-        typedef \
-            typename make_vector_type<cl_double, vector_size<T>::value>::type \
-            delta_vector_type; \
-        (void) in1; \
-        (void) in2; \
-        (void) in3; \
-        auto e = detail::make_value<delta_vector_type>(DELTA); \
-        return detail::multiply<delta_vector_type>(e, expected); \
-    } \
-    \
-    bool use_ulp() \
-    { \
-        return USE_ULP; \
-    } \
-    \
-    float ulp() \
-    { \
-        if(m_is_embedded) \
-        { \
-            return ULP_EMBEDDED; \
-        } \
-        return ULP; \
-    } \
-private: \
-    bool m_is_embedded; \
-};
-
-#define MATH_FUNCS_DEFINE_UNARY_FUNC(GROUP_NAME, NAME, HOST_FUNC, USE_ULP, ULP, ULP_EMBEDDED, DELTA, MIN1, MAX1) \
-    MATH_FUNCS_DEFINE_UNARY_FUNC1(GROUP_NAME, NAME, NAME, HOST_FUNC, USE_ULP, ULP, ULP_EMBEDDED, DELTA, MIN1, MAX1)
-#define MATH_FUNCS_DEFINE_BINARY_FUNC(GROUP_NAME, NAME, HOST_FUNC, USE_ULP, ULP, ULP_EMBEDDED, DELTA, MIN1, MAX1, MIN2, MAX2) \
-    MATH_FUNCS_DEFINE_BINARY_FUNC1(GROUP_NAME, NAME, NAME, HOST_FUNC, USE_ULP, ULP, ULP_EMBEDDED, DELTA, MIN1, MAX1, MIN2, MAX2)
-#define MATH_FUNCS_DEFINE_TERNARY_FUNC(GROUP_NAME, NAME, HOST_FUNC, USE_ULP, ULP, ULP_EMBEDDED, DELTA, MIN1, MAX1, MIN2, MAX2, MIN3, MAX3) \
-    MATH_FUNCS_DEFINE_TERNARY_FUNC1(GROUP_NAME, NAME, NAME, HOST_FUNC, USE_ULP, ULP, ULP_EMBEDDED, DELTA, MIN1, MAX1, MIN2, MAX2, MIN3, MAX3)
-
-#endif // TEST_CONFORMANCE_CLCPP_MATH_FUNCS_COMMON_FUNCS_HPP
diff --git a/test_conformance/clcpp/math_funcs/comparison_funcs.hpp b/test_conformance/clcpp/math_funcs/comparison_funcs.hpp
deleted file mode 100644
index 0bd6ff9196..0000000000
--- a/test_conformance/clcpp/math_funcs/comparison_funcs.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_MATH_FUNCS_COMPARISON_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_MATH_FUNCS_COMPARISON_FUNCS_HPP
-
-#include <type_traits>
-#include <cmath>
-
-#include "common.hpp"
-
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1
-MATH_FUNCS_DEFINE_BINARY_FUNC(comparison, fdim, std::fdim, true, 0.0f, 0.0f, 0.001f, -1000.0f, 1000.0f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_BINARY_FUNC(comparison, fmax, std::fmax, true, 0.0f, 0.0f, 0.001f, -1000.0f, 1000.0f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_BINARY_FUNC(comparison, fmin, std::fmin, true, 0.0f, 0.0f, 0.001f, -1000.0f, 1000.0f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_BINARY_FUNC(comparison, maxmag, reference::maxmag, true, 0.0f, 0.0f, 0.001f, -1000.0f, 1000.0f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_BINARY_FUNC(comparison, minmag, reference::minmag, true, 0.0f, 0.0f, 0.001f, -1000.0f, 1000.0f, -1000.0f, 1000.0f)
-
-// comparison functions
-AUTO_TEST_CASE(test_comparison_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // Check for EMBEDDED_PROFILE
-    bool is_embedded_profile = false;
-    char profile[128];
-    last_error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile), (void *)&profile, NULL);
-    RETURN_ON_CL_ERROR(last_error, "clGetDeviceInfo")
-    if (std::strcmp(profile, "EMBEDDED_PROFILE") == 0)
-        is_embedded_profile = true;
-
-    TEST_BINARY_FUNC_MACRO((comparison_func_fdim(is_embedded_profile)))
-    TEST_BINARY_FUNC_MACRO((comparison_func_fmax(is_embedded_profile)))
-    TEST_BINARY_FUNC_MACRO((comparison_func_fmin(is_embedded_profile)))
-    TEST_BINARY_FUNC_MACRO((comparison_func_maxmag(is_embedded_profile)))
-    TEST_BINARY_FUNC_MACRO((comparison_func_minmag(is_embedded_profile)))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_MATH_FUNCS_COMPARISON_FUNCS_HPP
diff --git a/test_conformance/clcpp/math_funcs/exponential_funcs.hpp b/test_conformance/clcpp/math_funcs/exponential_funcs.hpp
deleted file mode 100644
index 82a8247a4e..0000000000
--- a/test_conformance/clcpp/math_funcs/exponential_funcs.hpp
+++ /dev/null
@@ -1,139 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_MATH_FUNCS_EXP_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_MATH_FUNCS_EXP_FUNCS_HPP
-
-#include <type_traits>
-#include <cmath>
-
-#include "common.hpp"
-
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1
-MATH_FUNCS_DEFINE_UNARY_FUNC(exponential, exp, std::exp, true, 3.0f, 4.0f, 0.001f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(exponential, expm1, std::expm1, true, 3.0f, 4.0f, 0.001f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(exponential, exp2, std::exp2, true, 3.0f, 4.0f, 0.001f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(exponential, exp10, reference::exp10, true, 3.0f, 4.0f, 0.001f, -1000.0f, 1000.0f)
-
-struct exponential_func_ldexp : public binary_func<cl_float, cl_int, cl_float>
-{
-    exponential_func_ldexp(bool is_embedded) : m_is_embedded(is_embedded) 
-    {
-   
-    }
-   
-    std::string str()
-    {
-        return "ldexp";
-    }
-   
-    std::string headers() 
-    {
-        return "#include <opencl_math>\n";
-    }
-
-    /* Reference value type is cl_double */
-    cl_double operator()(const cl_float& x, const cl_int& y) 
-    {
-        return (std::ldexp)(static_cast<cl_double>(x), y);
-    }
-   
-    cl_float min1()
-    {
-        return -1000.0f;
-    }
-   
-    cl_float max1()
-    {
-        return 1000.0f;
-    }
-
-    cl_int min2()
-    {
-        return -8;
-    }
-   
-    cl_int max2()
-    {
-        return 8;
-    }
-   
-    std::vector<cl_float> in1_special_cases()
-    {
-        return { 
-            cl_float(0.0f),
-            cl_float(-0.0f),
-            cl_float(1.0f),
-            cl_float(-1.0f),
-            cl_float(2.0f),
-            cl_float(-2.0f),
-            std::numeric_limits<cl_float>::infinity(),
-            -std::numeric_limits<cl_float>::infinity(),
-            std::numeric_limits<cl_float>::quiet_NaN()
-        };
-    }
-   
-    bool use_ulp()
-    {
-        return true;
-    }
-   
-    float ulp()
-    {
-        if(m_is_embedded)
-        {
-            return 0.0f;
-        }
-        return 0.0f;
-    }
-private:
-    bool m_is_embedded;
-};
-
-// exponential functions
-AUTO_TEST_CASE(test_exponential_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // Check for EMBEDDED_PROFILE
-    bool is_embedded_profile = false;
-    char profile[128];
-    last_error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile), (void *)&profile, NULL);
-    RETURN_ON_CL_ERROR(last_error, "clGetDeviceInfo")
-    if (std::strcmp(profile, "EMBEDDED_PROFILE") == 0)
-        is_embedded_profile = true;
-
-    // auto exp(gentype x);
-    // auto expm1(gentype x);
-    // auto exp2(gentype x);
-    // auto exp10(gentype x);
-    TEST_UNARY_FUNC_MACRO((exponential_func_exp(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((exponential_func_expm1(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((exponential_func_exp2(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((exponential_func_exp10(is_embedded_profile)))
-
-    // auto ldexp(gentype x, intn k);
-    TEST_BINARY_FUNC_MACRO((exponential_func_ldexp(is_embedded_profile)))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_MATH_FUNCS_EXP_FUNCS_HPP
diff --git a/test_conformance/clcpp/math_funcs/floating_point_funcs.hpp b/test_conformance/clcpp/math_funcs/floating_point_funcs.hpp
deleted file mode 100644
index 63b4c23047..0000000000
--- a/test_conformance/clcpp/math_funcs/floating_point_funcs.hpp
+++ /dev/null
@@ -1,733 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_MATH_FUNCS_FP_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_MATH_FUNCS_FP_FUNCS_HPP
-
-#include <type_traits>
-#include <cmath>
-
-#include "common.hpp"
-
-// -------------- UNARY FUNCTIONS
-
-// gentype ceil(gentype x);
-// gentype floor(gentype x);
-// gentype rint(gentype x);
-// gentype round(gentype x);
-// gentype trunc(gentype x);
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1
-MATH_FUNCS_DEFINE_UNARY_FUNC(fp, ceil, std::ceil, true, 0.0f, 0.0f, 0.001f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(fp, floor, std::floor, true, 0.0f, 0.0f, 0.001f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(fp, rint, std::rint, true, 0.0f, 0.0f, 0.001f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(fp, round, std::round, true, 0.0f, 0.0f, 0.001f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(fp, trunc, std::trunc, true, 0.0f, 0.0f, 0.001f, -1000.0f, 1000.0f)
-
-// floatn nan(uintn nancode);
-struct fp_func_nan : public unary_func<cl_uint, cl_float>
-{
-    std::string str()
-    {
-        return "nan";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_math>\n";
-    }
-
-    cl_float operator()(const cl_uint& x)
-    {
-        cl_uint r = x | 0x7fc00000U;
-        // cl_float and cl_int have the same size so that's correct
-        cl_float rf = *reinterpret_cast<cl_float*>(&r);
-        return rf;
-    }
-
-    cl_uint min1()
-    {
-        return 0;
-    }
-
-    cl_uint max1()
-    {
-        return 100;
-    }
-
-    std::vector<cl_uint> in1_special_cases()
-    {
-        return {
-            0, 1
-        };
-    }
-};
-
-// -------------- UNARY FUNCTIONS, 2ND ARG IS POINTER
-
-// gentype fract(gentype x, gentype* iptr);
-//
-// Fuction fract() returns additional value via pointer (2nd argument). In order to test
-// if it's correct output buffer type is cl_float2. In first compontent we store what
-// fract() function returns, and in the 2nd component we store what is returned via its
-// 2nd argument (gentype* iptr).
-struct fp_func_fract : public unary_func<cl_float, cl_float2>
-{
-    fp_func_fract(bool is_embedded) : m_is_embedded(is_embedded)
-    {
-
-    }
-
-    std::string str()
-    {
-        return "fract";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_math>\n";
-    }
-
-    cl_double2 operator()(const cl_float& x)
-    {
-        return reference::fract(static_cast<cl_double>(x));
-    }
-
-    cl_float min1()
-    {
-        return -1000.0f;
-    }
-
-    cl_float max1()
-    {
-        return 1000.0f;
-    }
-
-    std::vector<cl_float> in1_special_cases()
-    {
-        return {
-            cl_float(0.0f),
-            cl_float(-0.0f),
-            cl_float(1.0f),
-            cl_float(-1.0f),
-            cl_float(2.0f),
-            cl_float(-2.0f),
-            std::numeric_limits<cl_float>::infinity(),
-            -std::numeric_limits<cl_float>::infinity(),
-            std::numeric_limits<cl_float>::quiet_NaN()
-        };
-    }
-
-    bool use_ulp()
-    {
-        return true;
-    }
-
-    float ulp()
-    {
-        if(m_is_embedded)
-        {
-            return 0.0f;
-        }
-        return 0.0f;
-    }
-private:
-    bool m_is_embedded;
-};
-
-// We need to specialize generate_kernel_unary<>() function template for fp_func_fract.
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <>
-std::string generate_kernel_unary<fp_func_fract, cl_float, cl_float2>(fp_func_fract func)
-{
-    return
-        "__kernel void test_fract(global float *input, global float2 *output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    float2 result;\n"
-        "    float itpr = 0;\n"
-        "    result.x = fract(input[gid], &itpr);\n"
-        "    result.y = itpr;\n"
-        "    output[gid] = result;\n"
-        "}\n";
-}
-#else
-template <>
-std::string generate_kernel_unary<fp_func_fract, cl_float, cl_float2>(fp_func_fract func)
-{
-    return
-        "" + func.defs() +
-        "" + func.headers() +
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "using namespace cl;\n"
-        "__kernel void test_fract(global_ptr<float[]> input, global_ptr<float2[]> output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    float2 result;\n"
-        "    float itpr = 0;\n"
-        "    result.x = fract(input[gid], &itpr);\n"
-        "    result.y = itpr;\n"
-        "    output[gid] = result;\n"
-        "}\n";
-}
-#endif
-
-// gentype modf(gentype x, gentype* iptr);
-//
-// Fuction modf() returns additional value via pointer (2nd argument). In order to test
-// if it's correct output buffer type is cl_float2. In first compontent we store what
-// modf() function returns, and in the 2nd component we store what is returned via its
-// 2nd argument (gentype* iptr).
-struct fp_func_modf : public unary_func<cl_float, cl_float2>
-{
-    fp_func_modf(bool is_embedded) : m_is_embedded(is_embedded)
-    {
-
-    }
-
-    std::string str()
-    {
-        return "modf";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_math>\n";
-    }
-
-    cl_double2 operator()(const cl_float& x)
-    {
-        cl_double2 r;
-        r.s[0] = (std::modf)(static_cast<cl_double>(x), &(r.s[1]));
-        return r;
-    }
-
-    cl_float min1()
-    {
-        return -1000.0f;
-    }
-
-    cl_float max1()
-    {
-        return 1000.0f;
-    }
-
-    std::vector<cl_float> in1_special_cases()
-    {
-        return {
-            cl_float(0.0f),
-            cl_float(-0.0f),
-            cl_float(1.0f),
-            cl_float(-1.0f),
-            cl_float(2.0f),
-            cl_float(-2.0f),
-            std::numeric_limits<cl_float>::infinity(),
-            -std::numeric_limits<cl_float>::infinity(),
-            std::numeric_limits<cl_float>::quiet_NaN()
-        };
-    }
-
-    bool use_ulp()
-    {
-        return true;
-    }
-
-    float ulp()
-    {
-        if(m_is_embedded)
-        {
-            return 0.0f;
-        }
-        return 0.0f;
-    }
-private:
-    bool m_is_embedded;
-};
-
-// We need to specialize generate_kernel_unary<>() function template for fp_func_modf.
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <>
-std::string generate_kernel_unary<fp_func_modf, cl_float, cl_float2>(fp_func_modf func)
-{
-    return
-        "__kernel void test_modf(global float *input, global float2 *output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    float2 result;\n"
-        "    float itpr = 0;\n"
-        "    result.x = modf(input[gid], &itpr);\n"
-        "    result.y = itpr;\n"
-        "    output[gid] = result;\n"
-        "}\n";
-}
-#else
-template <>
-std::string generate_kernel_unary<fp_func_modf, cl_float, cl_float2>(fp_func_modf func)
-{
-    return
-        "" + func.defs() +
-        "" + func.headers() +
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "using namespace cl;\n"
-        "__kernel void test_modf(global_ptr<float[]> input, global_ptr<float2[]> output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    float2 result;\n"
-        "    float itpr = 0;\n"
-        "    result.x = modf(input[gid], &itpr);\n"
-        "    result.y = itpr;\n"
-        "    output[gid] = result;\n"
-        "}\n";
-}
-#endif
-
-// gentype frexp(gentype x, intn* exp);
-//
-// Fuction frexp() returns additional value via pointer (2nd argument). In order to test
-// if it's correct output buffer type is cl_float2. In first compontent we store what
-// modf() function returns, and in the 2nd component we store what is returned via its
-// 2nd argument (intn* exp).
-struct fp_func_frexp : public unary_func<cl_float, cl_float2>
-{
-    fp_func_frexp(bool is_embedded) : m_is_embedded(is_embedded)
-    {
-
-    }
-
-    std::string str()
-    {
-        return "frexp";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_math>\n";
-    }
-
-    cl_double2 operator()(const cl_float& x)
-    {
-        cl_double2 r;
-        cl_int e;
-        r.s[0] = (std::frexp)(static_cast<cl_double>(x), &e);
-        r.s[1] = static_cast<cl_float>(e);
-        return r;
-    }
-
-    cl_float min1()
-    {
-        return -1000.0f;
-    }
-
-    cl_float max1()
-    {
-        return 1000.0f;
-    }
-
-    std::vector<cl_float> in1_special_cases()
-    {
-        return {
-            cl_float(0.0f),
-            cl_float(-0.0f),
-            cl_float(1.0f),
-            cl_float(-1.0f),
-            cl_float(2.0f),
-            cl_float(-2.0f),
-            std::numeric_limits<cl_float>::infinity(),
-            -std::numeric_limits<cl_float>::infinity(),
-            std::numeric_limits<cl_float>::quiet_NaN()
-        };
-    }
-
-    bool use_ulp()
-    {
-        return true;
-    }
-
-    float ulp()
-    {
-        if(m_is_embedded)
-        {
-            return 0.0f;
-        }
-        return 0.0f;
-    }
-private:
-    bool m_is_embedded;
-};
-
-// We need to specialize generate_kernel_unary<>() function template for fp_func_frexp.
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <>
-std::string generate_kernel_unary<fp_func_frexp, cl_float, cl_float2>(fp_func_frexp func)
-{
-    return
-        "__kernel void test_frexp(global float *input, global float2 *output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    float2 result;\n"
-        "    int itpr = 0;\n"
-        "    result.x = frexp(input[gid], &itpr);\n"
-        "    result.y = itpr;\n"
-        "    output[gid] = result;\n"
-        "}\n";
-}
-#else
-template <>
-std::string generate_kernel_unary<fp_func_frexp, cl_float, cl_float2>(fp_func_frexp func)
-{
-    return
-        "" + func.defs() +
-        "" + func.headers() +
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "using namespace cl;\n"
-        "__kernel void test_frexp(global_ptr<float[]> input, global_ptr<float2[]> output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    float2 result;\n"
-        "    int itpr = 0;\n"
-        "    result.x = frexp(input[gid], &itpr);\n"
-        "    result.y = itpr;\n"
-        "    output[gid] = result;\n"
-        "}\n";
-}
-#endif
-
-// -------------- BINARY FUNCTIONS
-
-// gentype copysign(gentype x, gentype y);
-// gentype fmod(gentype x, gentype y);
-// gentype remainder(gentype x, gentype y);
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1, min2, max2
-MATH_FUNCS_DEFINE_BINARY_FUNC(fp, copysign, std::copysign, true, 0.0f, 0.0f, 0.001f, -100.0f, 100.0f, -10.0f, 10.0f)
-MATH_FUNCS_DEFINE_BINARY_FUNC(fp, fmod, std::fmod, true, 0.0f, 0.0f, 0.001f, -100.0f, 100.0f, -10.0f, 10.0f)
-MATH_FUNCS_DEFINE_BINARY_FUNC(fp, remainder, std::remainder, true, 0.0f, 0.001f, 0.0f, -100.0f, 100.0f, -10.0f, 10.0f)
-
-// In case of function float nextafter(float, float) reference function must
-// operate on floats and return float.
-struct fp_func_nextafter : public binary_func<cl_float, cl_float, cl_float>
-{
-    fp_func_nextafter(bool is_embedded) : m_is_embedded(is_embedded)
-    {
-
-    }
-
-    std::string str()
-    {
-        return "nextafter";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_math>\n";
-    }
-
-    /* In this case reference value type MUST BE cl_float */
-    cl_float operator()(const cl_float& x, const cl_float& y)
-    {
-        return (std::nextafter)(x, y);
-    }
-
-    cl_float min1()
-    {
-        return -1000.0f;
-    }
-
-    cl_float max1()
-    {
-        return 500.0f;
-    }
-
-    cl_float min2()
-    {
-        return 501.0f;
-    }
-
-    cl_float max2()
-    {
-        return 1000.0f;
-    }
-
-    std::vector<cl_float> in1_special_cases()
-    {
-        return {
-            cl_float(0.0f),
-            cl_float(-0.0f),
-            cl_float(1.0f),
-            cl_float(-1.0f),
-            cl_float(2.0f),
-            cl_float(-2.0f),
-            std::numeric_limits<cl_float>::infinity(),
-            -std::numeric_limits<cl_float>::infinity(),
-            std::numeric_limits<cl_float>::quiet_NaN()
-        };
-    }
-
-    std::vector<cl_float> in2_special_cases()
-    {
-        return {
-            cl_float(0.0f),
-            cl_float(-0.0f),
-            cl_float(1.0f),
-            cl_float(-1.0f),
-            cl_float(2.0f),
-            cl_float(-2.0f),
-            std::numeric_limits<cl_float>::infinity(),
-            -std::numeric_limits<cl_float>::infinity(),
-            std::numeric_limits<cl_float>::quiet_NaN()
-        };
-    }
-
-    bool use_ulp()
-    {
-        return true;
-    }
-
-    float ulp()
-    {
-        if(m_is_embedded)
-        {
-            return 0.0f;
-        }
-        return 0.0f;
-    }
-private:
-    bool m_is_embedded;
-};
-
-// gentype remquo(gentype x, gentype y, intn* quo);
-struct fp_func_remquo : public binary_func<cl_float, cl_float, cl_float2>
-{
-    fp_func_remquo(bool is_embedded) : m_is_embedded(is_embedded)
-    {
-
-    }
-
-    std::string str()
-    {
-        return "remquo";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_math>\n";
-    }
-
-    cl_double2 operator()(const cl_float& x, const cl_float& y)
-    {
-        return reference::remquo(static_cast<cl_double>(x), static_cast<cl_double>(y));
-    }
-
-    cl_float min1()
-    {
-        return -1000.0f;
-    }
-
-    cl_float max1()
-    {
-        return 1000.0f;
-    }
-
-    cl_float min2()
-    {
-        return -1000.0f;
-    }
-
-    cl_float max2()
-    {
-        return 1000.0f;
-    }
-
-    std::vector<cl_float> in1_special_cases()
-    {
-        return {
-            cl_float(0.0f),
-            cl_float(-0.0f),
-            cl_float(1.0f),
-            cl_float(-1.0f),
-            std::numeric_limits<cl_float>::infinity(),
-            -std::numeric_limits<cl_float>::infinity(),
-            std::numeric_limits<cl_float>::quiet_NaN()
-        };
-    }
-
-    std::vector<cl_float> in2_special_cases()
-    {
-        return {
-            cl_float(0.0f),
-            cl_float(-0.0f),
-            cl_float(1.0f),
-            cl_float(-1.0f),
-            std::numeric_limits<cl_float>::infinity(),
-            -std::numeric_limits<cl_float>::infinity(),
-            std::numeric_limits<cl_float>::quiet_NaN()
-        };
-    }
-
-    bool use_ulp()
-    {
-        return true;
-    }
-
-    float ulp()
-    {
-        if(m_is_embedded)
-        {
-            return 0.0f;
-        }
-        return 0.0f;
-    }
-private:
-    bool m_is_embedded;
-};
-
-
-// We need to specialize generate_kernel_binary<>() function template for fp_func_remquo.
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <>
-std::string generate_kernel_binary<fp_func_remquo, cl_float, cl_float, cl_float2>(fp_func_remquo func)
-{
-    return
-        "__kernel void test_remquo(global float *input1, global float *input2, global float2 *output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    float2 result;\n"
-        "    int quo = 0;\n"
-        "    int sign = 0;\n"
-        "    result.x = remquo(input1[gid], input2[gid], &quo);\n"
-        // Specification say:
-        // "remquo also calculates the lower seven bits of the integral quotient x/y,
-        // and gives that value the same sign as x/y. It stores this signed value in
-        // the object pointed to by quo."
-        // Implemenation may save into quo more than seven bits. We need to take
-        // care of that here.
-        "    sign = (quo < 0) ? -1 : 1;\n"
-        "    quo = (quo < 0) ? -quo : quo;\n"
-        "    quo &= 0x0000007f;\n"
-        "    result.y = (sign < 0) ? -quo : quo;\n"
-        "    output[gid] = result;\n"
-        "}\n";
-}
-#else
-template <>
-std::string generate_kernel_binary<fp_func_remquo, cl_float, cl_float, cl_float2>(fp_func_remquo func)
-{
-    return
-        "" + func.defs() +
-        "" + func.headers() +
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "using namespace cl;\n"
-        "__kernel void test_remquo(global_ptr<float[]> input1, global_ptr<float[]> input2, global_ptr<float2[]> output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    float2 result;\n"
-        "    int quo = 0;\n"
-        "    int sign = 0;\n"
-        "    result.x = remquo(input1[gid], input2[gid], &quo);\n"
-        // Specification say:
-        // "remquo also calculates the lower seven bits of the integral quotient x/y,
-        // and gives that value the same sign as x/y. It stores this signed value in
-        // the object pointed to by quo."
-        // Implemenation may save into quo more than seven bits. We need to take
-        // care of that here.
-        "    sign = (quo < 0) ? -1 : 1;\n"
-        "    quo = (quo < 0) ? -quo : quo;\n"
-        "    quo &= 0x0000007f;\n"
-        "    result.y = (sign < 0) ? -quo : quo;\n"
-        "    output[gid] = result;\n"
-        "}\n";
-}
-#endif
-
-// -------------- TERNARY FUNCTIONS
-
-// gentype fma(gentype a, gentype b, gentype c);
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1, min2, max2, min3, max3
-MATH_FUNCS_DEFINE_TERNARY_FUNC(fp, fma, std::fma, true, 0.0f, 0.0f, 0.001f, -1000.0f, 1000.0f, -1000.0f, 1000.0f, -1000.0f, 1000.0f)
-
-// floating point functions
-AUTO_TEST_CASE(test_fp_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // Check for EMBEDDED_PROFILE
-    bool is_embedded_profile = false;
-    char profile[128];
-    last_error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile), (void *)&profile, NULL);
-    RETURN_ON_CL_ERROR(last_error, "clGetDeviceInfo")
-    if (std::strcmp(profile, "EMBEDDED_PROFILE") == 0)
-        is_embedded_profile = true;
-
-    // gentype ceil(gentype x);
-    TEST_UNARY_FUNC_MACRO((fp_func_ceil(is_embedded_profile)))
-    // gentype floor(gentype x);
-    TEST_UNARY_FUNC_MACRO((fp_func_floor(is_embedded_profile)))
-    // gentype rint(gentype x);
-    TEST_UNARY_FUNC_MACRO((fp_func_rint(is_embedded_profile)))
-    // gentype round(gentype x);
-    TEST_UNARY_FUNC_MACRO((fp_func_round(is_embedded_profile)))
-    // gentype trunc(gentype x);
-    TEST_UNARY_FUNC_MACRO((fp_func_trunc(is_embedded_profile)))
-
-    // floatn nan(uintn nancode);
-    TEST_UNARY_FUNC_MACRO((fp_func_nan()))
-
-    // gentype fract(gentype x, gentype* iptr);
-    TEST_UNARY_FUNC_MACRO((fp_func_fract(is_embedded_profile)))
-    // gentype modf(gentype x, gentype* iptr);
-    TEST_UNARY_FUNC_MACRO((fp_func_modf(is_embedded_profile)))
-    // gentype frexp(gentype x, intn* exp);
-    TEST_UNARY_FUNC_MACRO((fp_func_frexp(is_embedded_profile)))
-
-    // gentype remainder(gentype x, gentype y);
-    TEST_BINARY_FUNC_MACRO((fp_func_remainder(is_embedded_profile)))
-    // gentype copysign(gentype x, gentype y);
-    TEST_BINARY_FUNC_MACRO((fp_func_copysign(is_embedded_profile)))
-    // gentype fmod(gentype x, gentype y);
-    TEST_BINARY_FUNC_MACRO((fp_func_fmod(is_embedded_profile)))
-
-    // gentype nextafter(gentype x, gentype y);
-    TEST_BINARY_FUNC_MACRO((fp_func_nextafter(is_embedded_profile)))
-
-    // gentype remquo(gentype x, gentype y, intn* quo);
-    TEST_BINARY_FUNC_MACRO((fp_func_remquo(is_embedded_profile)))
-
-    // gentype fma(gentype a, gentype b, gentype c);
-    TEST_TERNARY_FUNC_MACRO((fp_func_fma(is_embedded_profile)))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_MATH_FUNCS_FP_FUNCS_HPP
diff --git a/test_conformance/clcpp/math_funcs/half_math_funcs.hpp b/test_conformance/clcpp/math_funcs/half_math_funcs.hpp
deleted file mode 100644
index d72d717ce1..0000000000
--- a/test_conformance/clcpp/math_funcs/half_math_funcs.hpp
+++ /dev/null
@@ -1,106 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_MATH_FUNCS_HALF_MATH_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_MATH_FUNCS_HALF_MATH_FUNCS_HPP
-
-#include <type_traits>
-#include <cmath>
-
-#include "common.hpp"
-
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)  
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, cos, half_cos, std::cos, true, 8192.0f, 8192.0f, 0.1f, -CL_M_PI_F, CL_M_PI_F)
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, sin, half_sin, std::sin, true, 8192.0f, 8192.0f, 0.1f, -CL_M_PI_F, CL_M_PI_F)
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, tan, half_tan, std::tan, true, 8192.0f, 8192.0f, 0.1f, -CL_M_PI_F, CL_M_PI_F)
-
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, exp, half_exp, std::exp, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, exp2, half_exp2, std::exp2, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, exp10, half_exp10, reference::exp10, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, log, half_log, std::log, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, log2, half_log2, std::log2, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, log10, half_log10, std::log10, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, rsqrt, half_rsqrt, reference::rsqrt, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, sqrt, half_sqrt, std::sqrt, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, recip, half_recip, reference::recip, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1, min2, max2
-MATH_FUNCS_DEFINE_BINARY_FUNC1(half_math, divide, half_divide, reference::divide, true, 8192.0f, 8192.0f, 0.1f, -1024.0f, 1024.0f, -1024.0f, 1024.0f)
-MATH_FUNCS_DEFINE_BINARY_FUNC1(half_math, powr, half_powr, reference::powr, true, 8192.0f, 8192.0f, 0.1f, -1024.0f, 1024.0f, -1024.0f, 1024.0f)
-#else
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, cos, half_math::cos, std::cos, true, 8192.0f, 8192.0f, 0.1f, -CL_M_PI_F, CL_M_PI_F)
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, sin, half_math::sin, std::sin, true, 8192.0f, 8192.0f, 0.1f, -CL_M_PI_F, CL_M_PI_F)
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, tan, half_math::tan, std::tan, true, 8192.0f, 8192.0f, 0.1f, -CL_M_PI_F, CL_M_PI_F)
-
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, exp, half_math::exp, std::exp, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, exp2, half_math::exp2, std::exp2, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, exp10, half_math::exp10, reference::exp10, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, log, half_math::log, std::log, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, log2, half_math::log2, std::log2, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, log10, half_math::log10, std::log10, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, rsqrt, half_math::rsqrt, reference::rsqrt, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, sqrt, half_math::sqrt, std::sqrt, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-
-MATH_FUNCS_DEFINE_UNARY_FUNC1(half_math, recip, half_math::recip, reference::recip, true, 8192.0f, 8192.0f, 0.1f, -1000.0f, 1000.0f)
-
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1, min2, max2
-MATH_FUNCS_DEFINE_BINARY_FUNC1(half_math, divide, half_math::divide, reference::divide, true, 8192.0f, 8192.0f, 0.1f, -1024.0f, 1024.0f, -1024.0f, 1024.0f)
-MATH_FUNCS_DEFINE_BINARY_FUNC1(half_math, powr, half_math::powr, reference::powr, true, 8192.0f, 8192.0f, 0.1f, -1024.0f, 1024.0f, -1024.0f, 1024.0f)
-#endif
-
-// comparison functions
-AUTO_TEST_CASE(test_half_math_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // Check for EMBEDDED_PROFILE
-    bool is_embedded_profile = false;
-    char profile[128];
-    last_error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile), (void *)&profile, NULL);
-    RETURN_ON_CL_ERROR(last_error, "clGetDeviceInfo")
-    if (std::strcmp(profile, "EMBEDDED_PROFILE") == 0)
-        is_embedded_profile = true;
-
-    TEST_UNARY_FUNC_MACRO((half_math_func_cos(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((half_math_func_sin(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((half_math_func_tan(is_embedded_profile)))
-
-    TEST_UNARY_FUNC_MACRO((half_math_func_exp(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((half_math_func_exp2(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((half_math_func_exp10(is_embedded_profile)))
-
-    TEST_UNARY_FUNC_MACRO((half_math_func_log(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((half_math_func_log2(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((half_math_func_log10(is_embedded_profile)))
-
-    TEST_BINARY_FUNC_MACRO((half_math_func_divide(is_embedded_profile)))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_MATH_FUNCS_HALF_MATH_FUNCS_HPP
diff --git a/test_conformance/clcpp/math_funcs/logarithmic_funcs.hpp b/test_conformance/clcpp/math_funcs/logarithmic_funcs.hpp
deleted file mode 100644
index 23e98302bc..0000000000
--- a/test_conformance/clcpp/math_funcs/logarithmic_funcs.hpp
+++ /dev/null
@@ -1,261 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_MATH_FUNCS_LOG_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_MATH_FUNCS_LOG_FUNCS_HPP
-
-#include <type_traits>
-#include <cmath>
-
-#include "common.hpp"
-
-namespace detail
-{
-
-// This function reads values of FP_ILOGB0 and FP_ILOGBNAN macros defined on the device.
-// OpenCL C++ Spec:
-// The value of FP_ILOGB0 shall be either {INT_MIN} or {INT_MAX}. The value of FP_ILOGBNAN
-// shall be either {INT_MAX} or {INT_MIN}.
-int get_ilogb_nan_zero(cl_device_id device, cl_context context, cl_command_queue queue, cl_int& ilogb_nan, cl_int& ilogb_zero)
-{
-    cl_mem buffers[1];
-    cl_program program;
-    cl_kernel kernel;
-    size_t work_size[1];
-    int err;
-
-    std::string code_str =
-        "__kernel void get_ilogb_nan_zero(__global int *out)\n"
-        "{\n"
-        "   out[0] = FP_ILOGB0;\n"
-        "   out[1] = FP_ILOGBNAN;\n"
-        "}\n";
-    std::string kernel_name("get_ilogb_nan_zero");
-
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name, "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-
-    std::vector<cl_int> output = generate_output<cl_int>(2);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_int) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    work_size[0] = 1;
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(cl_int) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    // Save
-    ilogb_zero = output[0];
-    ilogb_nan = output[1];
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-} // namespace detail
-
-struct logarithmic_func_ilogb : public unary_func<cl_float, cl_int>
-{
-    logarithmic_func_ilogb(cl_int ilogb_nan, cl_int ilogb_zero)
-        : m_ilogb_nan(ilogb_nan), m_ilogb_zero(ilogb_zero)
-    {
-
-    }
-
-    std::string str()
-    {
-        return "ilogb";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_math>\n";
-    }
-
-    cl_int operator()(const cl_float& x)
-    {
-        if((std::isnan)(x))
-        {
-            return m_ilogb_nan;
-        }
-        else if(x == 0.0 || x == -0.0)
-        {
-            return m_ilogb_zero;
-        }
-        static_assert(
-            sizeof(cl_int) == sizeof(int),
-            "Tests assumes that sizeof(cl_int) == sizeof(int)"
-        );
-        return (std::ilogb)(x);
-    }
-
-    cl_float min1()
-    {
-        return -100.0f;
-    }
-
-    cl_float max1()
-    {
-        return 1000.0f;
-    }
-
-    std::vector<cl_float> in1_special_cases()
-    {
-        return {
-            cl_float(0.0f),
-            cl_float(-0.0f),
-            cl_float(1.0f),
-            cl_float(-1.0f),
-            cl_float(2.0f),
-            cl_float(-2.0f),
-            std::numeric_limits<cl_float>::infinity(),
-            -std::numeric_limits<cl_float>::infinity(),
-            std::numeric_limits<cl_float>::quiet_NaN()
-        };
-    }
-private:
-    cl_int m_ilogb_nan;
-    cl_int m_ilogb_zero;
-};
-
-// gentype log(gentype x);
-// gentype logb(gentype x);
-// gentype log2(gentype x);
-// gentype log10(gentype x);
-// gentype log1p(gentype x);
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1
-MATH_FUNCS_DEFINE_UNARY_FUNC(logarithmic, log, std::log, true, 3.0f, 4.0f, 0.001f, -10.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(logarithmic, logb, std::logb, true, 0.0f, 0.0f, 0.001f, -10.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(logarithmic, log2, std::log2, true, 3.0f, 4.0f, 0.001f, -10.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(logarithmic, log10, std::log10, true, 3.0f, 4.0f, 0.001f, -10.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(logarithmic, log1p, std::log1p, true, 2.0f, 4.0f, 0.001f, -10.0f, 1000.0f)
-
-// gentype lgamma(gentype x);
-// OpenCL C++ Spec.:
-// The ULP values for built-in math functions lgamma and lgamma_r is currently undefined.
-// Because of that we don't check ULP and set acceptable delta to 0.2f (20%).
-MATH_FUNCS_DEFINE_UNARY_FUNC(logarithmic, lgamma, std::lgamma, false, 0.0f, 0.0f, 0.2f, -10.0f, 1000.0f)
-
-// gentype lgamma_r(gentype x, intn* signp);
-// OpenCL C++ Spec.:
-// The ULP values for built-in math functions lgamma and lgamma_r is currently undefined.
-// Because of that we don't check ULP and set acceptable delta to 0.2f (20%).
-//
-// Note:
-// We DO NOT test if sign of the gamma function return by lgamma_r is correct.
-MATH_FUNCS_DEFINE_UNARY_FUNC(logarithmic, lgamma_r, std::lgamma, false, 0.0f, 0.0f, 0.2f, -10.0f, 1000.0f)
-
-// We need to specialize generate_kernel_unary<>() function template for logarithmic_func_lgamma_r
-// because it takes two arguments, but only one of it is input, the 2nd one is used to return
-// the sign of the gamma function.
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <>
-std::string generate_kernel_unary<logarithmic_func_lgamma_r, cl_float, cl_float>(logarithmic_func_lgamma_r func)
-{
-    return
-        "__kernel void test_lgamma_r(global float *input, global float *output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    int sign;\n"
-        "    output[gid] = lgamma_r(input[gid], &sign);\n"
-        "}\n";
-}
-#else
-template <>
-std::string generate_kernel_unary<logarithmic_func_lgamma_r, cl_float, cl_float>(logarithmic_func_lgamma_r func)
-{
-    return
-        "" + func.defs() +
-        "" + func.headers() +
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "using namespace cl;\n"
-        "__kernel void test_lgamma_r(global_ptr<float[]> input, global_ptr<float[]> output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    int sign;\n"
-        "    output[gid] = lgamma_r(input[gid], &sign);\n"
-        "}\n";
-}
-#endif
-
-// logarithmic functions
-AUTO_TEST_CASE(test_logarithmic_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // Check for EMBEDDED_PROFILE
-    bool is_embedded_profile = false;
-    char profile[128];
-    error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile), (void *)&profile, NULL);
-    RETURN_ON_CL_ERROR(error, "clGetDeviceInfo")
-    if (std::strcmp(profile, "EMBEDDED_PROFILE") == 0)
-        is_embedded_profile = true;
-
-    // Write values of FP_ILOGB0 and FP_ILOGBNAN, which are macros defined on the device, to
-    // ilogb_zero and ilogb_nan.
-    cl_int ilogb_nan = 0;
-    cl_int ilogb_zero = 0;
-    error = detail::get_ilogb_nan_zero(device, context, queue, ilogb_nan, ilogb_zero);
-    RETURN_ON_ERROR_MSG(error, "detail::get_ilogb_nan_zero function failed");
-
-    // intn ilogb(gentype x);
-    TEST_UNARY_FUNC_MACRO((logarithmic_func_ilogb(ilogb_nan, ilogb_zero)))
-
-    // gentype log(gentype x);
-    // gentype logb(gentype x);
-    // gentype log2(gentype x);
-    // gentype log10(gentype x);
-    // gentype log1p(gentype x);
-    TEST_UNARY_FUNC_MACRO((logarithmic_func_log(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((logarithmic_func_logb(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((logarithmic_func_log2(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((logarithmic_func_log10(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((logarithmic_func_log1p(is_embedded_profile)))
-
-    // gentype lgamma(gentype x);
-    TEST_UNARY_FUNC_MACRO((logarithmic_func_lgamma(is_embedded_profile)))
-
-    // gentype lgamma(gentype x);
-    //
-    // Note:
-    // We DO NOT test if sign of the gamma function return by lgamma_r is correct
-    TEST_UNARY_FUNC_MACRO((logarithmic_func_lgamma_r(is_embedded_profile)))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_MATH_FUNCS_LOG_FUNCS_HPP
diff --git a/test_conformance/clcpp/math_funcs/main.cpp b/test_conformance/clcpp/math_funcs/main.cpp
deleted file mode 100644
index b51348712b..0000000000
--- a/test_conformance/clcpp/math_funcs/main.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include <limits>
-
-#include "../common.hpp"
-
-#include "comparison_funcs.hpp"
-#include "exponential_funcs.hpp"
-#include "floating_point_funcs.hpp"
-#include "half_math_funcs.hpp"
-#include "logarithmic_funcs.hpp"
-#include "other_funcs.hpp"
-#include "power_funcs.hpp"
-#include "trigonometric_funcs.hpp"
-
-int main(int argc, const char *argv[])
-{
-    // Check if cl_float (float) and cl_double (double) fulfill the requirements of
-    // IEC 559 (IEEE 754) standard. This is required for the tests to run correctly.
-    if(!std::numeric_limits<cl_float>::is_iec559)
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "cl_float (float) does not fulfill the requirements of IEC 559 (IEEE 754) standard. "
-            "Tests won't run correctly."
-        );
-    }
-    if(!std::numeric_limits<cl_double>::is_iec559)
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "cl_double (double) does not fulfill the requirements of IEC 559 (IEEE 754) standard. "
-            "Tests won't run correctly."
-        );
-    }
-
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/math_funcs/other_funcs.hpp b/test_conformance/clcpp/math_funcs/other_funcs.hpp
deleted file mode 100644
index f939a5674a..0000000000
--- a/test_conformance/clcpp/math_funcs/other_funcs.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_MATH_FUNCS_OTHER_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_MATH_FUNCS_OTHER_FUNCS_HPP
-
-#include <type_traits>
-#include <cmath>
-
-#include "common.hpp"
-
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1
-MATH_FUNCS_DEFINE_UNARY_FUNC(other, erfc, std::erfc, true, 16.0f, 16.0f, 0.001f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(other, erf, std::erf, true, 16.0f, 16.0f, 0.001f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(other, fabs, std::fabs, true, 0.0f, 0.0f, 0.001f, -1000.0f, 1000.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(other, tgamma, std::tgamma, true, 16.0f, 16.0f, 0.001f, -1000.0f, 1000.0f)
-
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1, min2, max2
-MATH_FUNCS_DEFINE_BINARY_FUNC(other, hypot, std::hypot, true, 4.0f, 4.0f, 0.001f, -1000.0f, 1000.0f, -1000.0f, 1000.0f)
-
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1, min2, max2, min3, max3
-MATH_FUNCS_DEFINE_TERNARY_FUNC(other, mad, reference::mad, false, 0.0f, 0.0f, 0.1f, -10.0f, 10.0f, -10.0f, 10.0f, -10.0f, 10.0f)
-
-// other functions
-AUTO_TEST_CASE(test_other_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // Check for EMBEDDED_PROFILE
-    bool is_embedded_profile = false;
-    char profile[128];
-    last_error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile), (void *)&profile, NULL);
-    RETURN_ON_CL_ERROR(last_error, "clGetDeviceInfo")
-    if (std::strcmp(profile, "EMBEDDED_PROFILE") == 0)
-        is_embedded_profile = true;
-
-    // gentype erf(gentype x);
-    // gentype erfc(gentype x);
-    TEST_UNARY_FUNC_MACRO((other_func_erfc(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((other_func_erf(is_embedded_profile)))
-
-    // gentype fabs(gentype x);
-    TEST_UNARY_FUNC_MACRO((other_func_fabs(is_embedded_profile)))
-
-    // gentype tgamma(gentype x);
-    TEST_UNARY_FUNC_MACRO((other_func_tgamma(is_embedded_profile)))
-
-    // gentype hypot(gentype x, gentype y);
-    TEST_BINARY_FUNC_MACRO((other_func_hypot(is_embedded_profile)))
-
-    // gentype mad(gentype a, gentype b, gentype c);
-    TEST_TERNARY_FUNC_MACRO((other_func_mad(is_embedded_profile)))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_MATH_FUNCS_OTHER_FUNCS_HPP
diff --git a/test_conformance/clcpp/math_funcs/power_funcs.hpp b/test_conformance/clcpp/math_funcs/power_funcs.hpp
deleted file mode 100644
index 2ace9b357c..0000000000
--- a/test_conformance/clcpp/math_funcs/power_funcs.hpp
+++ /dev/null
@@ -1,153 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_MATH_FUNCS_POWER_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_MATH_FUNCS_POWER_FUNCS_HPP
-
-#include <limits>
-#include <type_traits>
-#include <cmath>
-
-#include "common.hpp"
-
-#define DEFINE_BINARY_POWER_FUNC_INT(NAME, HOST_FUNC, USE_ULP, ULP, ULP_EMBEDDED, MIN1, MAX1, MIN2, MAX2) \
-struct power_func_ ## NAME : public binary_func<cl_float, cl_int, cl_float> \
-{ \
-    power_func_ ## NAME(bool is_embedded) : m_is_embedded(is_embedded)  \
-    { \
-    \
-    } \
-    \
-    std::string str() \
-    { \
-        return #NAME; \
-    } \
-    \
-    std::string headers()  \
-    { \
-        return "#include <opencl_math>\n"; \
-    } \
-    /* Reference value type is cl_double */ \
-    cl_double operator()(const cl_float& x, const cl_int& y)  \
-    { \
-        return (HOST_FUNC)(static_cast<cl_double>(x), y); \
-    } \
-    \
-    cl_float min1() \
-    { \
-        return MIN1; \
-    } \
-    \
-    cl_float max1() \
-    { \
-        return MAX1; \
-    } \
-    \
-    cl_int min2() \
-    { \
-        return MIN2; \
-    } \
-    \
-    cl_int max2() \
-    { \
-        return MAX2; \
-    } \
-    \
-    std::vector<cl_float> in1_special_cases() \
-    { \
-        return {  \
-            cl_float(-1.0f), \
-            cl_float(0.0f), \
-            cl_float(-0.0f), \
-        }; \
-    } \
-    \
-    std::vector<cl_int> in2_special_cases() \
-    { \
-        return {  \
-            2, 3, -1, 1, -2, 2 \
-        }; \
-    } \
-    \
-    bool use_ulp() \
-    { \
-        return USE_ULP; \
-    } \
-    \
-    float ulp() \
-    { \
-        if(m_is_embedded) \
-        { \
-            return ULP_EMBEDDED; \
-        } \
-        return ULP; \
-    } \
-private: \
-    bool m_is_embedded; \
-};
-
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1
-MATH_FUNCS_DEFINE_UNARY_FUNC(power, cbrt, std::cbrt, true, 2.0f, 4.0f, 0.001f, -1000.0f, -9.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(power, rsqrt, reference::rsqrt, true, 2.0f, 4.0f, 0.001f, 1.0f, 100.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(power, sqrt, std::sqrt, true, 3.0f, 4.0f, 0.001f, 1.0f, 100.0f)
-
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1, min2, max2
-MATH_FUNCS_DEFINE_BINARY_FUNC(power, pow, std::pow, true, 16.0f, 16.0f, 0.001f, 1.0f, 100.0f, 1.0f, 10.0f)
-MATH_FUNCS_DEFINE_BINARY_FUNC(power, powr, reference::powr, true, 16.0f, 16.0f, 0.001f, 1.0f, 100.0f, 1.0f, 10.0f)
-
-// func_name, reference_func, use_ulp, ulp, ulp_for_embedded, min1, max1, min2, max2
-DEFINE_BINARY_POWER_FUNC_INT(pown, std::pow, true, 16.0f, 16.0f, 1.0f, 100.0f, 1, 10)
-DEFINE_BINARY_POWER_FUNC_INT(rootn, reference::rootn, true, 16.0f, 16.0f, -100.0f, 100.0f, -10, 10)
-
-// power functions
-AUTO_TEST_CASE(test_power_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // Check for EMBEDDED_PROFILE
-    bool is_embedded_profile = false;
-    char profile[128];
-    last_error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile), (void *)&profile, NULL);
-    RETURN_ON_CL_ERROR(last_error, "clGetDeviceInfo")
-    if (std::strcmp(profile, "EMBEDDED_PROFILE") == 0)
-        is_embedded_profile = true;
-
-    // gentype cbrt(gentype x);
-    // gentype rsqrt(gentype x);
-    // gentype sqrt(gentype x);
-    TEST_UNARY_FUNC_MACRO((power_func_cbrt(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((power_func_sqrt(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((power_func_rsqrt(is_embedded_profile)))
-
-    // gentype pow(gentype x, gentype y);
-    // gentype powr(gentype x, gentype y);
-    TEST_BINARY_FUNC_MACRO((power_func_pow(is_embedded_profile)))
-    TEST_BINARY_FUNC_MACRO((power_func_powr(is_embedded_profile)))
-
-    // gentype pown(gentype x, intn y);
-    // gentype rootn(gentype x, intn y);
-    TEST_BINARY_FUNC_MACRO((power_func_pown(is_embedded_profile)))
-    TEST_BINARY_FUNC_MACRO((power_func_rootn(is_embedded_profile)))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_MATH_FUNCS_POWER_FUNCS_HPP
diff --git a/test_conformance/clcpp/math_funcs/reference.hpp b/test_conformance/clcpp/math_funcs/reference.hpp
deleted file mode 100644
index 0f5fc2fc21..0000000000
--- a/test_conformance/clcpp/math_funcs/reference.hpp
+++ /dev/null
@@ -1,315 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_MATH_FUNCS_REFERENCE_HPP
-#define TEST_CONFORMANCE_CLCPP_MATH_FUNCS_REFERENCE_HPP
-
-#include <type_traits>
-#include <cmath>
-#include <limits>
-
-#include "../common.hpp"
-
-namespace reference
-{
-    // Reference functions for OpenCL comparison functions that
-    // are not already defined in STL.
-    cl_float maxmag(const cl_float& x, const cl_float& y)
-    {
-        if((std::abs)(x) > (std::abs)(y))
-        {
-            return x;
-        }
-        else if((std::abs)(y) > (std::abs)(x))
-        {
-            return y;
-        }
-        return (std::fmax)(x, y);
-    }
-
-    cl_float minmag(const cl_float& x, const cl_float& y)
-    {
-        if((std::abs)(x) < (std::abs)(y))
-        {
-            return x;
-        }
-        else if((std::abs)(y) < (std::abs)(x))
-        {
-            return y;
-        }
-        return (std::fmin)(x, y);
-    }
-
-    // Reference functions for OpenCL exp functions that
-    // are not already defined in STL.
-    cl_double exp10(const cl_double& x)
-    {   
-        // 10^x = exp2( x * log2(10) )
-        auto log2_10 = (std::log2)(static_cast<long double>(10.0));
-        cl_double x_log2_10 = static_cast<cl_double>(x * log2_10);
-        return (std::exp2)(x_log2_10);
-    }
-
-    // Reference functions for OpenCL floating point functions that
-    // are not already defined in STL.
-    cl_double2 fract(cl_double x)
-    {
-        // Copied from math_brute_force/reference_math.c
-        cl_double2 r;
-        if((std::isnan)(x))
-        {
-            r.s[0] = std::numeric_limits<cl_double>::quiet_NaN();
-            r.s[1] = std::numeric_limits<cl_double>::quiet_NaN();
-            return r;
-        }
-
-        r.s[0] = (std::modf)(x, &(r.s[1]));
-        if(r.s[0] < 0.0 )
-        {
-            r.s[0] = 1.0f + r.s[0];
-            r.s[1] -= 1.0f;
-            if( r.s[0] == 1.0f )
-                r.s[0] = HEX_FLT(+, 1, fffffe, -, 1);
-        }
-        return r;
-    }
-
-    cl_double2 remquo(cl_double x, cl_double y)
-    {
-        cl_double2 r;
-        // remquo return the same value that is returned by the
-        // remainder function
-        r.s[0] = (std::remainder)(x,y);
-        // calulcate quo
-        cl_double x_y = (x - r.s[0]) / y;
-        cl_uint quo = (std::abs)(x_y);
-        r.s[1] = quo & 0x0000007fU;
-        if(x_y < 0.0)
-            r.s[1] = -r.s[1];
-
-        // fix edge cases
-        if(!(std::isnan)(x) && y == 0.0)
-        {
-            r.s[1] = 0;
-        }
-        else if((std::isnan)(x) && (std::isnan)(y))
-        {
-            r.s[1] = 0;
-        }
-        return r;
-    }
-
-    // Reference functions for OpenCL half_math:: functions that
-    // are not already defined in STL.
-    cl_double divide(cl_double x, cl_double y)
-    {
-        return x / y;
-    }
-
-    cl_double recip(cl_double x)
-    {
-        return 1.0 / x;
-    }
-
-    // Reference functions for OpenCL other functions that
-    // are not already defined in STL.
-    cl_double mad(cl_double x, cl_double y, cl_double z)
-    {
-        return (x * y) + z;
-    }
-
-    // Reference functions for OpenCL power functions that
-    // are not already defined in STL.
-    cl_double rsqrt(const cl_double& x)
-    {
-        return cl_double(1.0) / ((std::sqrt)(x));
-    }
-
-    cl_double powr(const cl_double& x, const cl_double& y)
-    {
-        //powr(x, y) returns NaN for x < 0.
-        if( x < 0.0 )
-            return std::numeric_limits<cl_double>::quiet_NaN();
-
-        //powr ( x, NaN ) returns the NaN for x >= 0.
-        //powr ( NaN, y ) returns the NaN.
-        if((std::isnan)(x) || (std::isnan)(y) )
-            return std::numeric_limits<cl_double>::quiet_NaN();
-
-        if( x == 1.0 )
-        {
-            //powr ( +1, +-inf ) returns NaN.
-            if((std::abs)(y) == INFINITY )
-                return std::numeric_limits<cl_double>::quiet_NaN();
-
-            //powr ( +1, y ) is 1 for finite y. (NaN handled above)
-            return 1.0;
-        }
-
-        if( y == 0.0 )
-        {
-            //powr ( +inf, +-0 ) returns NaN.
-            //powr ( +-0, +-0 ) returns NaN.
-            if( x == 0.0 || x == std::numeric_limits<cl_double>::infinity())
-                return std::numeric_limits<cl_double>::quiet_NaN();
-
-            //powr ( x, +-0 ) is 1 for finite x > 0.  (x <= 0, NaN, INF already handled above)
-            return 1.0;
-        }
-
-        if( x == 0.0 )
-        {
-            //powr ( +-0, -inf) is +inf.
-            //powr ( +-0, y ) is +inf for finite y < 0.
-            if( y < 0.0 )
-                return std::numeric_limits<cl_double>::infinity();
-
-            //powr ( +-0, y ) is +0 for y > 0.    (NaN, y==0 handled above)
-            return 0.0;
-        }
-
-        // x = +inf
-        if( (std::isinf)(x) )
-        {
-            if( y < 0 )
-                return 0;
-            return std::numeric_limits<cl_double>::infinity();
-        }
-
-        double fabsx = (std::abs)(x);
-        double fabsy = (std::abs)(y);
-
-        //y = +-inf cases
-        if( (std::isinf)(fabsy) )
-        {
-            if( y < 0.0 )
-            {
-                if( fabsx < 1.0 )
-                    return std::numeric_limits<cl_double>::infinity();
-                return 0;
-            }
-            if( fabsx < 1.0 )
-                return 0.0;
-            return std::numeric_limits<cl_double>::infinity();
-        }        
-        return (std::pow)(x, y);
-    }
-
-    cl_double rootn(const cl_double& x, const cl_int n)
-    {
-        //rootn (x, 0) returns a NaN.
-        if(n == 0)
-            return std::numeric_limits<cl_double>::quiet_NaN();
-
-        //rootn ( x, n )  returns a NaN for x < 0 and n is even.
-        if(x < 0 && 0 == (n & 1))
-            return std::numeric_limits<cl_double>::quiet_NaN();
-
-        if(x == 0.0)
-        {
-            if(n > 0)
-            {
-                //rootn ( +-0,  n ) is +0 for even n > 0.
-                if(0 == (n & 1))
-                {
-                    return cl_double(0.0);
-                }
-                //rootn ( +-0,  n ) is +-0 for odd n > 0.
-                else
-                {
-                    return x;
-                }
-            }
-            else
-            {
-                //rootn ( +-0,  n ) is +inf for even n < 0.
-                if(0 == ((-n) & 1))
-                {
-                    return std::numeric_limits<cl_double>::infinity();
-                }
-                //rootn ( +-0,  n ) is +-inf for odd n < 0.
-                else
-                {
-                    return (std::copysign)(
-                        std::numeric_limits<cl_double>::infinity(), x
-                    );
-                }   
-            }
-        }
-
-        cl_double r = (std::abs)(x);
-        r = (std::exp2)((std::log2)(r) / static_cast<cl_double>(n));
-        return (std::copysign)(r, x);
-    }
-
-    // Reference functions for OpenCL trigonometric functions that
-    // are not already defined in STL.
-    cl_double acospi(cl_double x)
-    {
-        return (std::acos)(x) / CL_M_PI;
-    }
-
-    cl_double asinpi(cl_double x)
-    {
-        return (std::asin)(x) / CL_M_PI;
-    }
-
-    cl_double atanpi(cl_double x)
-    {
-        return (std::atan)(x) / CL_M_PI;
-    }
-
-    cl_double cospi(cl_double x)
-    {
-        return (std::cos)(x * CL_M_PI);
-    }
-
-    cl_double sinpi(cl_double x)
-    {
-        return (std::sin)(x * CL_M_PI);
-    }
-
-    cl_double tanpi(cl_double x)
-    {
-        return (std::tan)(x * CL_M_PI);
-    }
-
-    cl_double atan2(cl_double x, cl_double y)
-    {
-    #if defined(WIN32) || defined(_WIN32) 
-        // Fix edge cases for Windows
-        if ((std::isinf)(x) && (std::isinf)(y)) {
-            cl_double retval = (y > 0) ? CL_M_PI_4 : 3.f * CL_M_PI_4;
-            return (x > 0) ? retval : -retval;
-        }
-    #endif // defined(WIN32) || defined(_WIN32) 
-        return (std::atan2)(x, y);
-    }
-
-    cl_double atan2pi(cl_double x, cl_double y)
-    {
-        return ::reference::atan2(x, y) / CL_M_PI;
-    }
-
-    cl_double2 sincos(cl_double x)
-    {
-        cl_double2 r;
-        r.s[0] = (std::sin)(x);
-        r.s[1] = (std::cos)(x);
-        return r;
-    }
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_MATH_FUNCS_REFERENCE_HPP
diff --git a/test_conformance/clcpp/math_funcs/trigonometric_funcs.hpp b/test_conformance/clcpp/math_funcs/trigonometric_funcs.hpp
deleted file mode 100644
index 343024a891..0000000000
--- a/test_conformance/clcpp/math_funcs/trigonometric_funcs.hpp
+++ /dev/null
@@ -1,222 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_MATH_FUNCS_TRI_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_MATH_FUNCS_TRI_FUNCS_HPP
-
-#include <type_traits>
-#include <cmath>
-
-#include "common.hpp"
-
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, acos, std::acos, true, 4.0f, 4.0f, 0.001f, -1.0f, 1.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, acosh, std::acosh, true, 4.0f, 4.0f, 0.001f, -1.0f, 1.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, acospi, reference::acospi, true, 5.0f, 5.0f, 0.001f, -1.0f, 1.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, asin, std::asin, true, 4.0f, 4.0f, 0.001f, -1.0f, 1.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, asinh, std::asinh, true, 4.0f, 4.0f, 0.001f, -1.0f, 1.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, asinpi, reference::asinpi, true, 5.0f, 5.0f, 0.001f, -1.0f, 1.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, atan, std::atan, true, 5.0f, 5.0f, 0.001f, -1.0f, 1.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, atanh, std::atanh, true, 5.0f, 5.0f, 0.001f, -1.0f, 1.0f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, atanpi, reference::atanpi, true, 5.0f, 5.0f, 0.001f, -1.0f, 1.0f)
-
-// For (sin/cos/tan)pi functions min input value is -0.24 and max input value is 0.24,
-// so (CL_M_PI * x) is never greater than CL_M_PI_F.
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, cos, std::cos, true, 4.0f, 4.0f, 0.001f, -CL_M_PI_F, CL_M_PI_F)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, cosh, std::cosh, true, 4.0f, 4.0f, 0.001f, -CL_M_PI_F, CL_M_PI_F)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, cospi, reference::cospi, true, 4.0f, 4.0f, 0.001f, -0.24, -0.24f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, sin, std::sin, true, 4.0f, 4.0f, 0.001f, -CL_M_PI_F, CL_M_PI_F)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, sinh, std::sinh, true, 4.0f, 4.0f, 0.001f, -CL_M_PI_F, CL_M_PI_F)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, sinpi, reference::sinpi, true, 4.0f, 4.0f, 0.001f, -0.24, -0.24f)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, tan, std::tan, true, 5.0f, 5.0f, 0.001f, -CL_M_PI_F, CL_M_PI_F)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, tanh, std::tanh, true, 5.0f, 5.0f, 0.001f, -CL_M_PI_F, CL_M_PI_F)
-MATH_FUNCS_DEFINE_UNARY_FUNC(trigonometric, tanpi, reference::tanpi, true, 6.0f, 6.0f, 0.001f, -0.24, -0.24f)
-
-// group_name, func_name, reference_func, use_ulp, ulp, ulp_for_embedded, max_delta, min1, max1, min2, max2
-MATH_FUNCS_DEFINE_BINARY_FUNC(trigonometric, atan2, reference::atan2, true, 6.0f, 6.0f, 0.001f, -1.0f, 1.0f, -1.0f, 1.0f)
-MATH_FUNCS_DEFINE_BINARY_FUNC(trigonometric, atan2pi, reference::atan2pi, true, 6.0f, 6.0f, 0.001f, -1.0f, 1.0f, -1.0f, 1.0f)
-
-// gentype sincos(gentype x, gentype * cosval);
-//
-// Fact that second argument is a pointer is inconvenient.
-//
-// We don't want to modify all helper functions defined in funcs_test_utils.hpp
-// that run test kernels generated based on this class and check if results are
-// correct, so instead of having two output cl_float buffers, one for sines and
-// one for cosines values, we use one cl_float2 output buffer (first component is
-// sine, second is cosine).
-//
-// Below we also define specialization of generate_kernel_unary function template
-// for trigonometric_func_sincos.
-struct trigonometric_func_sincos : public unary_func<cl_float, cl_float2>
-{
-    trigonometric_func_sincos(bool is_embedded) : m_is_embedded(is_embedded) 
-    {
-
-    }
-
-    std::string str()
-    {
-        return "sincos";
-    }
-
-    std::string headers() 
-    {
-        return "#include <opencl_math>\n";
-    }
-
-    /* Reference value type is cl_double */
-    cl_double2 operator()(const cl_float& x) 
-    {
-        return (reference::sincos)(static_cast<cl_double>(x));
-    }
-
-    cl_float min1()
-    {
-        return -CL_M_PI_F;
-    }
-
-    cl_float max1()
-    {
-        return CL_M_PI_F;
-    }
-
-    bool use_ulp()
-    {
-        return true;
-    }
-
-    float ulp()
-    {
-        if(m_is_embedded)
-        {
-            return 4.0f;
-        }
-        return 4.0f;
-    }
-private:
-    bool m_is_embedded;
-};
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)    
-template <>
-std::string generate_kernel_unary<trigonometric_func_sincos, cl_float, cl_float2>(trigonometric_func_sincos func)
-{    
-    return 
-        "__kernel void test_sincos(global float *input, global float2 *output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    float2 sine_cosine_of_x;\n"
-        "    float cosine_of_x = 0;\n"
-        "    sine_cosine_of_x.x = sincos(input[gid], &(cosine_of_x));\n"
-        "    sine_cosine_of_x.y = cosine_of_x;\n"
-        "    output[gid] = sine_cosine_of_x;\n"
-        "}\n";
-}
-#else
-template <>
-std::string generate_kernel_unary<trigonometric_func_sincos, cl_float, cl_float2>(trigonometric_func_sincos func)
-{
-    return         
-        "" + func.defs() + 
-        "" + func.headers() +
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "using namespace cl;\n"
-        "__kernel void test_sincos(global_ptr<float[]> input, global_ptr<float2[]> output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    float2 sine_cosine_of_x;\n"
-        "    float cosine_of_x = 0;\n"
-        "    sine_cosine_of_x.x = sincos(input[gid], &(cosine_of_x));\n"
-        "    sine_cosine_of_x.y = cosine_of_x;\n"
-        "    output[gid] = sine_cosine_of_x;\n"
-        "}\n";
-}
-#endif
-
-// trigonometric functions
-AUTO_TEST_CASE(test_trigonometric_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    // Check for EMBEDDED_PROFILE
-    bool is_embedded_profile = false;
-    char profile[128];
-    last_error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile), (void *)&profile, NULL);
-    RETURN_ON_CL_ERROR(last_error, "clGetDeviceInfo")
-    if (std::strcmp(profile, "EMBEDDED_PROFILE") == 0)
-        is_embedded_profile = true;
-
-    // gentype acos(gentype x);
-    // gentype acosh(gentype x);
-    // gentype acospi(gentype x);
-    // gentype asin(gentype x);
-    // gentype asinh(gentype x);
-    // gentype asinpi(gentype x);
-    // gentype atan(gentype x);
-    // gentype atanh(gentype x);
-    // gentype atanpi(gentype x);
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_acos(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_acosh(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_acospi(is_embedded_profile))) 
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_asin(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_asinh(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_asinpi(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_atan(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_atanh(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_atanpi(is_embedded_profile)))
-
-    // gentype cos(gentype x);
-    // gentype cosh(gentype x);
-    // gentype cospi(gentype x);
-    // gentype sin(gentype x);
-    // gentype sinh(gentype x);
-    // gentype sinpi(gentype x);
-    // gentype tan(gentype x);
-    // gentype tanh(gentype x);
-    // gentype tanpi(gentype x);
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_cos(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_cosh(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_cospi(is_embedded_profile))) 
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_sin(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_sinh(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_sinpi(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_tan(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_tanh(is_embedded_profile)))
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_tanpi(is_embedded_profile)))
-
-    // gentype atan2(gentype y, gentype x);
-    // gentype atan2pi(gentype y, gentype x);
-    TEST_BINARY_FUNC_MACRO((trigonometric_func_atan2(is_embedded_profile)))
-    TEST_BINARY_FUNC_MACRO((trigonometric_func_atan2pi(is_embedded_profile)))
-
-    // gentype sincos(gentype x, gentype * cosval);
-    TEST_UNARY_FUNC_MACRO((trigonometric_func_sincos(is_embedded_profile)))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_MATH_FUNCS_TRI_FUNCS_HPP
diff --git a/test_conformance/clcpp/pipes/CMakeLists.txt b/test_conformance/clcpp/pipes/CMakeLists.txt
deleted file mode 100644
index 65daae9750..0000000000
--- a/test_conformance/clcpp/pipes/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_PIPES)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/pipes/main.cpp b/test_conformance/clcpp/pipes/main.cpp
deleted file mode 100644
index 0ed4ef68ea..0000000000
--- a/test_conformance/clcpp/pipes/main.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "test_pipes.hpp"
-
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/pipes/test_pipes.hpp b/test_conformance/clcpp/pipes/test_pipes.hpp
deleted file mode 100644
index 3fc30dcd99..0000000000
--- a/test_conformance/clcpp/pipes/test_pipes.hpp
+++ /dev/null
@@ -1,632 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_PIPES_TEST_PIPES_HPP
-#define TEST_CONFORMANCE_CLCPP_PIPES_TEST_PIPES_HPP
-
-#include <sstream>
-#include <string>
-#include <tuple>
-#include <vector>
-#include <algorithm>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-
-
-namespace test_pipes {
-
-enum class pipe_source
-{
-    param,
-    storage
-};
-
-enum class pipe_operation
-{
-    work_item,
-    work_item_reservation,
-    work_group_reservation,
-    sub_group_reservation
-};
-
-struct test_options
-{
-    pipe_operation operation;
-    pipe_source source;
-    int max_packets;
-    int num_packets;
-};
-
-struct output_type
-{
-    cl_uint write_reservation_is_valid;
-    cl_uint write_success;
-
-    cl_uint num_packets;
-    cl_uint max_packets;
-    cl_uint read_reservation_is_valid;
-    cl_uint read_success;
-
-    cl_uint value;
-};
-
-const std::string source_common = R"(
-struct output_type
-{
-    uint write_reservation_is_valid;
-    uint write_success;
-
-    uint num_packets;
-    uint max_packets;
-    uint read_reservation_is_valid;
-    uint read_success;
-
-    uint value;
-};
-)";
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-std::string generate_source(test_options options)
-{
-    std::stringstream s;
-    s << source_common;
-    if (options.operation == pipe_operation::work_item)
-    {
-        s << R"(
-    kernel void producer(write_only pipe uint out_pipe, global struct output_type *output)
-    {
-        const ulong gid = get_global_id(0);
-
-        output[gid].write_reservation_is_valid = 1;
-
-        uint value = gid;
-        output[gid].write_success = write_pipe(out_pipe, &value) == 0;
-    }
-
-    kernel void consumer(read_only pipe uint in_pipe, global struct output_type *output)
-    {
-        const ulong gid = get_global_id(0);
-
-        output[gid].num_packets = get_pipe_num_packets(in_pipe);
-        output[gid].max_packets = get_pipe_max_packets(in_pipe);
-
-        output[gid].read_reservation_is_valid = 1;
-
-        uint value;
-        output[gid].read_success = read_pipe(in_pipe, &value) == 0;
-        output[gid].value = value;
-    }
-    )";
-    }
-    else if (options.operation == pipe_operation::work_item_reservation)
-    {
-        s << R"(
-    kernel void producer(write_only pipe uint out_pipe, global struct output_type *output)
-    {
-        const ulong gid = get_global_id(0);
-        if (gid % 2 == 1) return;
-
-        reserve_id_t reservation = reserve_write_pipe(out_pipe, 2);
-        output[gid + 0].write_reservation_is_valid = is_valid_reserve_id(reservation);
-        output[gid + 1].write_reservation_is_valid = is_valid_reserve_id(reservation);
-
-        uint value0 = gid + 0;
-        uint value1 = gid + 1;
-        output[gid + 0].write_success = write_pipe(out_pipe, reservation, 0, &value0) == 0;
-        output[gid + 1].write_success = write_pipe(out_pipe, reservation, 1, &value1) == 0;
-        commit_write_pipe(out_pipe, reservation);
-    }
-
-    kernel void consumer(read_only pipe uint in_pipe, global struct output_type *output)
-    {
-        const ulong gid = get_global_id(0);
-        if (gid % 2 == 1) return;
-
-        output[gid + 0].num_packets = get_pipe_num_packets(in_pipe);
-        output[gid + 0].max_packets = get_pipe_max_packets(in_pipe);
-        output[gid + 1].num_packets = get_pipe_num_packets(in_pipe);
-        output[gid + 1].max_packets = get_pipe_max_packets(in_pipe);
-
-        reserve_id_t reservation = reserve_read_pipe(in_pipe, 2);
-        output[gid + 0].read_reservation_is_valid = is_valid_reserve_id(reservation);
-        output[gid + 1].read_reservation_is_valid = is_valid_reserve_id(reservation);
-
-        uint value0;
-        uint value1;
-        output[gid + 0].read_success = read_pipe(in_pipe, reservation, 1, &value0) == 0;
-        output[gid + 1].read_success = read_pipe(in_pipe, reservation, 0, &value1) == 0;
-        commit_read_pipe(in_pipe, reservation);
-        output[gid + 0].value = value0;
-        output[gid + 1].value = value1;
-    }
-    )";
-    }
-    else if (options.operation == pipe_operation::work_group_reservation)
-    {
-        s << R"(
-    kernel void producer(write_only pipe uint out_pipe, global struct output_type *output)
-    {
-        const ulong gid = get_global_id(0);
-
-        reserve_id_t reservation = work_group_reserve_write_pipe(out_pipe, get_local_size(0));
-        output[gid].write_reservation_is_valid = is_valid_reserve_id(reservation);
-
-        uint value = gid;
-        output[gid].write_success = write_pipe(out_pipe, reservation, get_local_id(0), &value) == 0;
-        work_group_commit_write_pipe(out_pipe, reservation);
-    }
-
-    kernel void consumer(read_only pipe uint in_pipe, global struct output_type *output)
-    {
-        const ulong gid = get_global_id(0);
-
-        output[gid].num_packets = get_pipe_num_packets(in_pipe);
-        output[gid].max_packets = get_pipe_max_packets(in_pipe);
-
-        reserve_id_t reservation = work_group_reserve_read_pipe(in_pipe, get_local_size(0));
-        output[gid].read_reservation_is_valid = is_valid_reserve_id(reservation);
-
-        uint value;
-        output[gid].read_success = read_pipe(in_pipe, reservation, get_local_size(0) - 1 - get_local_id(0), &value) == 0;
-        work_group_commit_read_pipe(in_pipe, reservation);
-        output[gid].value = value;
-    }
-    )";
-    }
-    else if (options.operation == pipe_operation::sub_group_reservation)
-    {
-        s << R"(
-    #pragma OPENCL EXTENSION cl_khr_subgroups : enable
-
-    kernel void producer(write_only pipe uint out_pipe, global struct output_type *output)
-    {
-        const ulong gid = get_global_id(0);
-
-        reserve_id_t reservation = sub_group_reserve_write_pipe(out_pipe, get_sub_group_size());
-        output[gid].write_reservation_is_valid = is_valid_reserve_id(reservation);
-
-        uint value = gid;
-        output[gid].write_success = write_pipe(out_pipe, reservation, get_sub_group_local_id(), &value) == 0;
-        sub_group_commit_write_pipe(out_pipe, reservation);
-    }
-
-    kernel void consumer(read_only pipe uint in_pipe, global struct output_type *output)
-    {
-        const ulong gid = get_global_id(0);
-
-        output[gid].num_packets = get_pipe_num_packets(in_pipe);
-        output[gid].max_packets = get_pipe_max_packets(in_pipe);
-
-        reserve_id_t reservation = sub_group_reserve_read_pipe(in_pipe, get_sub_group_size());
-        output[gid].read_reservation_is_valid = is_valid_reserve_id(reservation);
-
-        uint value;
-        output[gid].read_success = read_pipe(in_pipe, reservation, get_sub_group_size() - 1 - get_sub_group_local_id(), &value) == 0;
-        sub_group_commit_read_pipe(in_pipe, reservation);
-        output[gid].value = value;
-    }
-    )";
-    }
-
-    return s.str();
-}
-#else
-std::string generate_source(test_options options)
-{
-    std::stringstream s;
-    s << R"(
-    #include <opencl_memory>
-    #include <opencl_common>
-    #include <opencl_work_item>
-    #include <opencl_synchronization>
-    #include <opencl_pipe>
-    using namespace cl;
-    )";
-
-    s << source_common;
-
-    std::string init_out_pipe;
-    std::string init_in_pipe;
-    if (options.source == pipe_source::param)
-    {
-        init_out_pipe = "auto out_pipe = pipe_param;";
-        init_in_pipe = "auto in_pipe = pipe_param;";
-    }
-    else if (options.source == pipe_source::storage)
-    {
-        s << "pipe_storage<uint, " << std::to_string(options.max_packets) << "> storage;";
-        init_out_pipe = "auto out_pipe = storage.get<pipe_access::write>();";
-        init_in_pipe = "auto in_pipe = make_pipe(storage);";
-    }
-
-    if (options.operation == pipe_operation::work_item)
-    {
-        s << R"(
-    kernel void producer(pipe<uint, pipe_access::write> pipe_param, global_ptr<output_type[]> output)
-    {
-        )" << init_out_pipe << R"(
-        const ulong gid = get_global_id(0);
-
-        output[gid].write_reservation_is_valid = 1;
-
-        uint value = gid;
-        output[gid].write_success = out_pipe.write(value);
-    }
-
-    kernel void consumer(pipe<uint, pipe_access::read> pipe_param, global_ptr<output_type[]> output)
-    {
-        )" << init_in_pipe << R"(
-        const ulong gid = get_global_id(0);
-
-        output[gid].num_packets = in_pipe.num_packets();
-        output[gid].max_packets = in_pipe.max_packets();
-
-        output[gid].read_reservation_is_valid = 1;
-
-        uint value;
-        output[gid].read_success = in_pipe.read(value);
-        output[gid].value = value;
-    }
-    )";
-    }
-    else if (options.operation == pipe_operation::work_item_reservation)
-    {
-        s << R"(
-    kernel void producer(pipe<uint, pipe_access::write> pipe_param, global_ptr<output_type[]> output)
-    {
-        )" << init_out_pipe << R"(
-        const ulong gid = get_global_id(0);
-        if (gid % 2 == 1) return;
-
-        auto reservation = out_pipe.reserve(2);
-        output[gid + 0].write_reservation_is_valid = reservation.is_valid();
-        output[gid + 1].write_reservation_is_valid = reservation.is_valid();
-
-        uint value0 = gid + 0;
-        uint value1 = gid + 1;
-        output[gid + 0].write_success = reservation.write(0, value0);
-        output[gid + 1].write_success = reservation.write(1, value1);
-        reservation.commit();
-    }
-
-    kernel void consumer(pipe<uint, pipe_access::read> pipe_param, global_ptr<output_type[]> output)
-    {
-        )" << init_in_pipe << R"(
-        const ulong gid = get_global_id(0);
-        if (gid % 2 == 1) return;
-
-        output[gid + 0].num_packets = in_pipe.num_packets();
-        output[gid + 0].max_packets = in_pipe.max_packets();
-        output[gid + 1].num_packets = in_pipe.num_packets();
-        output[gid + 1].max_packets = in_pipe.max_packets();
-
-        auto reservation = in_pipe.reserve(2);
-        output[gid + 0].read_reservation_is_valid = reservation.is_valid();
-        output[gid + 1].read_reservation_is_valid = reservation.is_valid();
-
-        uint value0;
-        uint value1;
-        output[gid + 0].read_success = reservation.read(1, value0);
-        output[gid + 1].read_success = reservation.read(0, value1);
-        reservation.commit();
-        output[gid + 0].value = value0;
-        output[gid + 1].value = value1;
-    }
-    )";
-    }
-    else if (options.operation == pipe_operation::work_group_reservation)
-    {
-        s << R"(
-    kernel void producer(pipe<uint, pipe_access::write> pipe_param, global_ptr<output_type[]> output)
-    {
-        )" << init_out_pipe << R"(
-        const ulong gid = get_global_id(0);
-
-        auto reservation = out_pipe.work_group_reserve(get_local_size(0));
-        output[gid].write_reservation_is_valid = reservation.is_valid();
-
-        uint value = gid;
-        output[gid].write_success = reservation.write(get_local_id(0), value);
-        reservation.commit();
-    }
-
-    kernel void consumer(pipe<uint, pipe_access::read> pipe_param, global_ptr<output_type[]> output)
-    {
-        )" << init_in_pipe << R"(
-        const ulong gid = get_global_id(0);
-
-        output[gid].num_packets = in_pipe.num_packets();
-        output[gid].max_packets = in_pipe.max_packets();
-
-        auto reservation = in_pipe.work_group_reserve(get_local_size(0));
-        output[gid].read_reservation_is_valid = reservation.is_valid();
-
-        uint value;
-        output[gid].read_success = reservation.read(get_local_size(0) - 1 - get_local_id(0), value);
-        reservation.commit();
-        output[gid].value = value;
-    }
-    )";
-    }
-    else if (options.operation == pipe_operation::sub_group_reservation)
-    {
-        s << R"(
-    kernel void producer(pipe<uint, pipe_access::write> pipe_param, global_ptr<output_type[]> output)
-    {
-        )" << init_out_pipe << R"(
-        const ulong gid = get_global_id(0);
-
-        auto reservation = out_pipe.sub_group_reserve(get_sub_group_size());
-        output[gid].write_reservation_is_valid = reservation.is_valid();
-
-        uint value = gid;
-        output[gid].write_success = reservation.write(get_sub_group_local_id(), value);
-        reservation.commit();
-    }
-
-    kernel void consumer(pipe<uint, pipe_access::read> pipe_param, global_ptr<output_type[]> output)
-    {
-        )" << init_in_pipe << R"(
-        const ulong gid = get_global_id(0);
-
-        output[gid].num_packets = in_pipe.num_packets();
-        output[gid].max_packets = in_pipe.max_packets();
-
-        auto reservation = in_pipe.sub_group_reserve(get_sub_group_size());
-        output[gid].read_reservation_is_valid = reservation.is_valid();
-
-        uint value;
-        output[gid].read_success = reservation.read(get_sub_group_size() - 1 - get_sub_group_local_id(), value);
-        reservation.commit();
-        output[gid].value = value;
-    }
-    )";
-    }
-
-    return s.str();
-}
-#endif
-
-int test(cl_device_id device, cl_context context, cl_command_queue queue, test_options options)
-{
-    int error = CL_SUCCESS;
-
-    if (options.num_packets % 2 != 0 || options.max_packets < options.num_packets)
-    {
-        RETURN_ON_ERROR_MSG(-1, "Invalid test options")
-    }
-
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    if (options.operation == pipe_operation::sub_group_reservation && !is_extension_available(device, "cl_khr_subgroups"))
-    {
-        log_info("SKIPPED: Extension `cl_khr_subgroups` is not supported. Skipping tests.\n");
-        return CL_SUCCESS;
-    }
-#endif
-
-    cl_program program;
-    cl_kernel producer_kernel;
-    cl_kernel consumer_kernel;
-
-    std::string producer_kernel_name = "producer";
-    std::string consumer_kernel_name = "consumer";
-    std::string source = generate_source(options);
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(
-        context, &program, &producer_kernel,
-        source, producer_kernel_name
-    );
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(
-        context, &program, &producer_kernel,
-        source, producer_kernel_name, "-cl-std=CL2.0", false
-    );
-    RETURN_ON_ERROR(error)
-    consumer_kernel = clCreateKernel(program, consumer_kernel_name.c_str(), &error);
-    RETURN_ON_CL_ERROR(error, "clCreateKernel")
-// Normal run
-#else
-    error = create_opencl_kernel(
-        context, &program, &producer_kernel,
-        source, producer_kernel_name
-    );
-    RETURN_ON_ERROR(error)
-    consumer_kernel = clCreateKernel(program, consumer_kernel_name.c_str(), &error);
-    RETURN_ON_CL_ERROR(error, "clCreateKernel")
-#endif
-
-    size_t max_work_group_size;
-    error = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL);
-    RETURN_ON_CL_ERROR(error, "clGetDeviceInfo")
-
-    const size_t count = options.num_packets;
-    const size_t local_size = (std::min)((size_t)256, max_work_group_size);
-    const size_t global_size = count;
-
-    const cl_uint packet_size = sizeof(cl_uint);
-
-    cl_mem pipe = clCreatePipe(context, 0, packet_size, options.max_packets, NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreatePipe")
-
-    cl_mem output_buffer;
-    output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(output_type) * count, NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    const char pattern = 0;
-    error = clEnqueueFillBuffer(queue, output_buffer, &pattern, sizeof(pattern), 0, sizeof(output_type) * count, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueFillBuffer")
-
-    error = clSetKernelArg(producer_kernel, 0, sizeof(cl_mem), &pipe);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-    error = clSetKernelArg(producer_kernel, 1, sizeof(output_buffer), &output_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    error = clEnqueueNDRangeKernel(queue, producer_kernel, 1, NULL, &global_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-    error = clSetKernelArg(consumer_kernel, 0, sizeof(cl_mem), &pipe);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-    error = clSetKernelArg(consumer_kernel, 1, sizeof(output_buffer), &output_buffer);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    error = clEnqueueNDRangeKernel(queue, consumer_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-    std::vector<output_type> output(count);
-    error = clEnqueueReadBuffer(
-        queue, output_buffer, CL_TRUE,
-        0, sizeof(output_type) * count,
-        static_cast<void *>(output.data()),
-        0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-    std::vector<bool> existing_values(count, false);
-    for (size_t gid = 0; gid < count; gid++)
-    {
-        const output_type &o = output[gid];
-
-        if (!o.write_reservation_is_valid)
-        {
-            RETURN_ON_ERROR_MSG(-1, "write reservation is not valid")
-        }
-        if (!o.write_success)
-        {
-            RETURN_ON_ERROR_MSG(-1, "write did not succeed")
-        }
-
-        if (o.num_packets == 0 || o.num_packets > options.num_packets)
-        {
-            RETURN_ON_ERROR_MSG(-1, "num_packets did not return correct value")
-        }
-        if (o.max_packets != options.max_packets)
-        {
-            RETURN_ON_ERROR_MSG(-1, "max_packets did not return correct value")
-        }
-        if (!o.read_reservation_is_valid)
-        {
-            RETURN_ON_ERROR_MSG(-1, "read reservation is not valid")
-        }
-        if (!o.read_success)
-        {
-            RETURN_ON_ERROR_MSG(-1, "read did not succeed")
-        }
-
-        // Every value must be presented once in any order
-        if (o.value >= count || existing_values[o.value])
-        {
-            RETURN_ON_ERROR_MSG(-1, "kernel did not return correct value")
-        }
-        existing_values[o.value] = true;
-    }
-
-    clReleaseMemObject(pipe);
-    clReleaseMemObject(output_buffer);
-    clReleaseKernel(producer_kernel);
-    clReleaseKernel(consumer_kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-const pipe_operation pipe_operations[] = {
-    pipe_operation::work_item,
-    pipe_operation::work_item_reservation,
-    pipe_operation::work_group_reservation,
-    pipe_operation::sub_group_reservation
-};
-
-const std::tuple<int, int> max_and_num_packets[] = {
-    std::make_tuple<int, int>(2, 2),
-    std::make_tuple<int, int>(10, 8),
-    std::make_tuple<int, int>(256, 254),
-    std::make_tuple<int, int>(1 << 16, 1 << 16),
-    std::make_tuple<int, int>((1 << 16) + 5, 1 << 16),
-    std::make_tuple<int, int>(12345, 12344),
-    std::make_tuple<int, int>(1 << 18, 1 << 18)
-};
-
-AUTO_TEST_CASE(test_pipes_pipe)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    std::vector<std::tuple<int, int>> ps;
-    for (auto p : max_and_num_packets)
-    {
-        if (std::get<0>(p) < num_elements)
-            ps.push_back(p);
-    }
-    ps.push_back(std::tuple<int, int>(num_elements, num_elements));
-
-    int error = CL_SUCCESS;
-
-    for (auto operation : pipe_operations)
-    for (auto p : ps)
-    {
-        test_options options;
-        options.source = pipe_source::param;
-        options.max_packets = std::get<0>(p);
-        options.num_packets = std::get<1>(p);
-        options.operation = operation;
-
-        error = test(device, context, queue, options);
-        RETURN_ON_ERROR(error)
-    }
-
-    return error;
-}
-
-AUTO_TEST_CASE(test_pipes_pipe_storage)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    std::vector<std::tuple<int, int>> ps;
-    for (auto p : max_and_num_packets)
-    {
-        if (std::get<0>(p) < num_elements)
-            ps.push_back(p);
-    }
-    ps.push_back(std::tuple<int, int>(num_elements, num_elements));
-
-    int error = CL_SUCCESS;
-
-    for (auto operation : pipe_operations)
-    for (auto p : ps)
-    {
-        test_options options;
-        options.source = pipe_source::storage;
-        options.max_packets = std::get<0>(p);
-        options.num_packets = std::get<1>(p);
-        options.operation = operation;
-
-        error = test(device, context, queue, options);
-        RETURN_ON_ERROR(error)
-    }
-
-    return error;
-}
-
-} // namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_PIPES_TEST_PIPES_HPP
diff --git a/test_conformance/clcpp/program_scope_ctors_dtors/CMakeLists.txt b/test_conformance/clcpp/program_scope_ctors_dtors/CMakeLists.txt
deleted file mode 100644
index fd36d3006e..0000000000
--- a/test_conformance/clcpp/program_scope_ctors_dtors/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_PROGRAM_SCOPE_CTORS_DTORS)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/program_scope_ctors_dtors/common.hpp b/test_conformance/clcpp/program_scope_ctors_dtors/common.hpp
deleted file mode 100644
index 9eb17f9212..0000000000
--- a/test_conformance/clcpp/program_scope_ctors_dtors/common.hpp
+++ /dev/null
@@ -1,283 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_PS_CTORS_DTORS_COMMON_HPP
-#define TEST_CONFORMANCE_CLCPP_PS_CTORS_DTORS_COMMON_HPP
-
-#include <algorithm>
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#define RUN_PS_CTORS_DTORS_TEST_MACRO(TEST_CLASS) \
-    last_error = run_ps_ctor_dtor_test(  \
-        device, context, queue, count, TEST_CLASS \
-    );  \
-    CHECK_ERROR(last_error) \
-    error |= last_error;
-
-// Base class for all tests for kernels with program scope object with
-// non-trivial ctors and/or dtors
-struct ps_ctors_dtors_test_base : public detail::base_func_type<cl_uint>
-{
-    // ctor is true, if and only if OpenCL program of this test contains program
-    // scope variable with non-trivial ctor.
-    // dtor is true, if and only if OpenCL program of this test contains program
-    // scope variable with non-trivial dtor.
-    ps_ctors_dtors_test_base(const bool ctor,
-                             const bool dtor)
-        : m_ctor(ctor), m_dtor(dtor)
-    {
-
-    }
-    virtual ~ps_ctors_dtors_test_base() { };
-    // Returns test name
-    virtual std::string str() = 0;
-    // Returns OpenCL program source
-    virtual std::string generate_program() = 0;
-    // Returns kernel names IN ORDER
-    virtual std::vector<std::string> get_kernel_names()
-    {
-        // Typical case, that is, only one kernel
-        return { this->get_kernel_name() };
-    }
-    // Returns value that is expected to be in output_buffer[i]
-    virtual cl_uint operator()(size_t i) = 0;
-    // Executes kernels
-    // Typical case: execute every kernel once, every kernel has only
-    // one argument, that is, output buffer
-    virtual cl_int execute(const std::vector<cl_kernel>& kernels,
-                           cl_mem& output_buffer,
-                           cl_command_queue& queue,
-                           size_t work_size)
-    {
-        cl_int err;
-        for(auto& k : kernels)
-        {
-            err = clSetKernelArg(k, 0, sizeof(output_buffer), &output_buffer);
-            RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-            err = clEnqueueNDRangeKernel(
-                queue, k, 1,
-                NULL, &work_size, NULL,
-                0, NULL, NULL
-            );
-            RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-        }
-        return err;
-    }
-    // This method check if queries for CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT
-    // and CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT using clGetProgramInfo()
-    // return correct values
-    virtual cl_int ctors_dtors_present_queries(cl_program program)
-    {
-        cl_int error = CL_SUCCESS;
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-            return error;
-        #else
-            // CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT cl_bool
-            // This indicates that the program object contains non-trivial constructor(s) that will be
-            // executed by runtime before any kernel from the program is executed.
-
-            // CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT cl_bool
-            // This indicates that the program object contains non-trivial destructor(s) that will be
-            // executed by runtime when program is destroyed.
-
-            // CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT
-            cl_bool ctors_present;
-            size_t cl_bool_size;
-            error = clGetProgramInfo(
-                program,
-                CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT,
-                sizeof(cl_bool),
-                static_cast<void*>(&ctors_present),
-                &cl_bool_size
-            );
-            RETURN_ON_CL_ERROR(error, "clGetProgramInfo")
-            if(cl_bool_size != sizeof(cl_bool))
-            {
-                error = -1;
-                CHECK_ERROR_MSG(
-                    error,
-                    "Test failed, param_value_size_ret != sizeof(cl_bool) (%lu != %lu).\n",
-                    cl_bool_size,
-                    sizeof(cl_bool)
-                );
-            }
-
-            // CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT
-            cl_bool dtors_present = 0;
-            error = clGetProgramInfo(
-                program,
-                CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT,
-                sizeof(cl_bool),
-                static_cast<void*>(&ctors_present),
-                &cl_bool_size
-            );
-            RETURN_ON_CL_ERROR(error, "clGetProgramInfo")
-            if(cl_bool_size != sizeof(cl_bool))
-            {
-                error = -1;
-                CHECK_ERROR_MSG(
-                    error,
-                    "Test failed, param_value_size_ret != sizeof(cl_bool) (%lu != %lu).\n",
-                    cl_bool_size,
-                    sizeof(cl_bool)
-                );
-            }
-
-            // check constructors
-            if(m_ctor && ctors_present != CL_TRUE)
-            {
-                error = -1;
-                CHECK_ERROR_MSG(
-                    error,
-                    "Test failed, CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT: 0, should be: 1.\n"
-                );
-            }
-            else if(!m_ctor && ctors_present == CL_TRUE)
-            {
-                error = -1;
-                CHECK_ERROR_MSG(
-                    error,
-                    "Test failed, CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT: 1, should be: 0.\n"
-                );
-            }
-
-            // check destructors
-            if(m_dtor && dtors_present != CL_TRUE)
-            {
-                error = -1;
-                CHECK_ERROR_MSG(
-                    error,
-                    "Test failed, CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT: 0, should be: 1.\n"
-                );
-            }
-            else if(!m_dtor && dtors_present == CL_TRUE)
-            {
-                error = -1;
-                CHECK_ERROR_MSG(
-                    error,
-                    "Test failed, CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT: 1, should be: 0.\n"
-                );
-            }
-            return error;
-        #endif
-    }
-
-private:
-    bool m_ctor;
-    bool m_dtor;
-};
-
-template <class ps_ctor_dtor_test>
-int run_ps_ctor_dtor_test(cl_device_id device, cl_context context, cl_command_queue queue, size_t count, ps_ctor_dtor_test op)
-{
-    cl_mem buffers[1];
-    cl_program program;
-    std::vector<cl_kernel> kernels;
-    size_t work_size[1];
-    cl_int err;
-
-    std::string code_str = op.generate_program();
-    std::vector<std::string> kernel_names = op.get_kernel_names();
-    if(kernel_names.empty())
-    {
-        RETURN_ON_ERROR_MSG(-1, "No kernel to run");
-    }
-    kernels.resize(kernel_names.size());
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &(kernels[0]), code_str, kernel_names[0]);
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &(kernels[0]), code_str, kernel_names[0], "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-    for(size_t i = 1; i < kernels.size(); i++)
-    {
-        kernels[i] = clCreateKernel(program, kernel_names[i].c_str(), &err);
-        RETURN_ON_CL_ERROR(err, "clCreateKernel");
-    }
-#else
-    err = create_opencl_kernel(context, &program, &(kernels[0]), code_str, kernel_names[0]);
-    RETURN_ON_ERROR(err)
-    for(size_t i = 1; i < kernels.size(); i++)
-    {
-        kernels[i] = clCreateKernel(program, kernel_names[i].c_str(), &err);
-        RETURN_ON_CL_ERROR(err, "clCreateKernel");
-    }
-#endif
-
-    work_size[0] = count;
-    // host output vector
-    std::vector<cl_uint> output = generate_output<cl_uint>(work_size[0], 9999);
-
-    // device output buffer
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    // Execute test
-    err = op.execute(kernels, buffers[0], queue, work_size[0]);
-    RETURN_ON_ERROR(err)
-
-    // Check if queries returns correct values
-    err = op.ctors_dtors_present_queries(program);
-    RETURN_ON_ERROR(err);
-
-    // Release kernels and program
-    // Destructors should be called now
-    for(auto& k : kernels)
-    {
-        err = clReleaseKernel(k);
-        RETURN_ON_CL_ERROR(err, "clReleaseKernel");
-    }
-    err = clReleaseProgram(program);
-    RETURN_ON_CL_ERROR(err, "clReleaseProgram");
-
-    // Finish
-    err = clFinish(queue);
-    RETURN_ON_CL_ERROR(err, "clFinish");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    // Check output values
-    for(size_t i = 0; i < output.size(); i++)
-    {
-        cl_uint v = op(i);
-        if(!(are_equal(v, output[i], detail::make_value<cl_uint>(0), op)))
-        {
-            RETURN_ON_ERROR_MSG(-1,
-                "test_%s(%s) failed. Expected: %s, got: %s", op.str().c_str(), type_name<cl_uint>().c_str(),
-                format_value(v).c_str(), format_value(output[i]).c_str()
-            );
-        }
-    }
-    log_info("test_%s(%s) passed\n", op.str().c_str(), type_name<cl_uint>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    return err;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_PS_CTORS_DTORS_COMMON_HPP
diff --git a/test_conformance/clcpp/program_scope_ctors_dtors/main.cpp b/test_conformance/clcpp/program_scope_ctors_dtors/main.cpp
deleted file mode 100644
index 78b077394b..0000000000
--- a/test_conformance/clcpp/program_scope_ctors_dtors/main.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "test_ctors_dtors.hpp"
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/program_scope_ctors_dtors/test_ctors_dtors.hpp b/test_conformance/clcpp/program_scope_ctors_dtors/test_ctors_dtors.hpp
deleted file mode 100644
index c9ac0821ce..0000000000
--- a/test_conformance/clcpp/program_scope_ctors_dtors/test_ctors_dtors.hpp
+++ /dev/null
@@ -1,324 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_PS_CTORS_DTORS_TEST_CTORS_DTORS_HPP
-#define TEST_CONFORMANCE_CLCPP_PS_CTORS_DTORS_TEST_CTORS_DTORS_HPP
-
-#include "common.hpp"
-
-// Test for program scope variable with non-trivial ctor
-struct ps_ctor_test : public ps_ctors_dtors_test_base
-{
-    ps_ctor_test(const cl_uint test_value)
-        : ps_ctors_dtors_test_base(true, false),
-          m_test_value(test_value)
-    {
-
-    }
-    
-    std::string str()
-    {
-        return "ps_ctor_test";
-    }
-
-    std::vector<std::string> get_kernel_names()
-    {
-        return { 
-            this->str() + "_set",
-            this->str() + "_read"
-        };
-    }
-
-    // Returns value that is expected to be in output_buffer[i]
-    cl_uint operator()(size_t i)
-    {
-        if(i % 2 == 0)
-            return m_test_value;
-        return cl_uint(0xbeefbeef);
-    }
-
-    // In 1st kernel 0th work-tem sets member m_x of program scope variable global_var to
-    // m_test_value and m_y to uint(0xbeefbeef),
-    // In 2nd kernel:
-    // 1) if global id is even, then work-item reads global_var.m_x and writes it to output[its-global-id];
-    // 2) otherwise, work-item reads global_var.m_y and writes it to output[its-global-id].
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) 
-            return 
-                "__kernel void " + this->get_kernel_names()[0] + "(global uint *output)\n"
-                "{\n"
-                "   size_t gid = get_global_id(0);\n"
-                "   output[gid] = 0xbeefbeef;\n"
-                "}\n"
-                "__kernel void " + this->get_kernel_names()[1] + "(global uint *output)\n"
-                "{\n"
-                "   size_t gid = get_global_id(0);\n"
-                "   if(gid % 2 == 0)\n"
-                "      output[gid] = " + std::to_string(m_test_value) + ";\n"
-                "}\n";
-        #else
-            return         
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_array>\n"
-                "using namespace cl;\n"
-                // struct template
-                "template<class T>\n"    
-                "struct ctor_test_class_base {\n"
-                // non-trivial ctor
-                "   ctor_test_class_base(T x) { m_x = x;};\n"
-                "   T m_x;\n"
-                "};\n"
-                // struct template
-                "template<class T>\n"    
-                "struct ctor_test_class : public ctor_test_class_base<T> {\n"
-                // non-trivial ctor
-                "   ctor_test_class(T x, T y) : ctor_test_class_base<T>(x), m_y(y) { };\n"
-                "   T m_y;\n"
-                "};\n"
-                // global scope program variables
-                "ctor_test_class<uint> global_var(uint(0), uint(0));\n"
-
-                "__kernel void " + this->get_kernel_names()[0] + "(global_ptr<uint[]> output)\n"
-                "{\n"
-                "   size_t gid = get_global_id(0);\n"
-                "   if(gid == 0) {\n"
-                "       global_var.m_x = " + std::to_string(m_test_value) + ";\n"  
-                "       global_var.m_y = 0xbeefbeef;\n"  
-                "   }\n"
-                "}\n"
-
-                "__kernel void " + this->get_kernel_names()[1] + "(global_ptr<uint[]> output)\n"
-                "{\n"
-                "   size_t gid = get_global_id(0);\n"
-                "   if(gid % 2 == 0)\n"
-                "      output[gid] = global_var.m_x;\n"
-                "   else\n"
-                "      output[gid] = global_var.m_y;\n"
-                "}\n";        
-        #endif
-    }
-
-private:
-    cl_uint m_test_value;
-};
-
-// Test for program scope variable with non-trivial dtor
-struct ps_dtor_test : public ps_ctors_dtors_test_base
-{
-    ps_dtor_test(const cl_uint test_value)
-        : ps_ctors_dtors_test_base(false, true),
-          m_test_value(test_value)
-    {
-
-    }
-    
-    std::string str()
-    {
-        return "ps_dtor_test";
-    }
-
-    // Returns value that is expected to be in output_buffer[i]
-    cl_uint operator()(size_t i)
-    {
-        if(i % 2 == 0)
-            return m_test_value;
-        return 1;
-    }
-
-    // In 1st kernel 0th work-item saves pointer to output buffer and its size in program scope
-    // variable global_var, it also sets counter to 1;
-    // After global_var is destroyed all even elements of output buffer should equal m_test_value, 
-    // and all odd should equal 1.
-    // If odd elements of output buffer are >1 it means dtor was executed more than once.
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) 
-            return 
-                "__kernel void " + this->get_kernel_name() + "(global uint *output)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    if(gid % 2 == 0)\n"
-                "        output[gid] = " + std::to_string(m_test_value) + ";\n"
-                "    else\n"
-                "        output[gid] = 1;\n"
-                "}\n";
-        #else
-            return         
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_array>\n"
-                "using namespace cl;\n"
-                // struct template
-                "template<class T>\n"
-                "struct dtor_test_class_base {\n"
-                // non-trivial dtor
-                // set all odd elements in buffer to counter
-                "   ~dtor_test_class_base() {\n"
-                "       for(size_t i = 1; i < this->size; i+=2)\n"
-                "       {\n"
-                "           this->buffer[i] = counter;\n"
-                "       }\n"
-                "       counter++;\n"
-                "   };\n"
-                "   global_ptr<uint[]> buffer;\n"
-                "   size_t size;\n"
-                "   T counter;\n"
-                "};\n" 
-                // struct   
-                "struct dtor_test_class : public dtor_test_class_base<uint> {\n"
-                // non-trivial dtor
-                // set all values in buffer to m_test_value
-                "   ~dtor_test_class() {\n"
-                "       for(size_t i = 0; i < this->size; i+=2)\n"
-                "           this->buffer[i] = " + std::to_string(m_test_value) + ";\n"
-                "   };\n"
-                "};\n" 
-                // global scope program variable
-                "dtor_test_class global_var;\n"
-
-                // When global_var is being destroyed, first dtor ~dtor_test_class is called,
-                // and then ~dtor_test_class_base is called.
-
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<uint[]> output)\n"
-                "{\n"
-                "   size_t gid = get_global_id(0);\n"
-                // set buffer and size in global var
-                "   if(gid == 0){\n"
-                "       global_var.buffer = output;\n"
-                "       global_var.size = get_global_size(0);\n"
-                "       global_var.counter = 1;\n"
-                "   }\n"
-                "}\n";
-        #endif
-    }
-
-private:
-    cl_uint m_test_value;
-};
-
-// Test for program scope variable with both non-trivial ctor
-// and non-trivial dtor
-struct ps_ctor_dtor_test : public ps_ctors_dtors_test_base
-{
-    ps_ctor_dtor_test(const cl_uint test_value)
-        : ps_ctors_dtors_test_base(false, true),
-          m_test_value(test_value)
-    {
-
-    }
-    
-    std::string str()
-    {
-        return "ps_ctor_dtor_test";
-    }
-
-    // Returns value that is expected to be in output_buffer[i]
-    cl_uint operator()(size_t i)
-    {
-        return m_test_value;
-    }
-
-    // In 1st kernel 0th work-item saves pointer to output buffer and its size in program scope
-    // variable global_var.
-    // After global_var is destroyed all even elements of output buffer should equal m_test_value, 
-    // and all odd should equal 1.
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) 
-            return 
-                "__kernel void " + this->get_kernel_name() + "(global uint *output)\n"
-                "{\n"
-                "    size_t gid = get_global_id(0);\n"
-                "    output[gid] = " + std::to_string(m_test_value) + ";\n"
-                "}\n";
-        #else
-            return         
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_array>\n"
-                "using namespace cl;\n"
-                // struct template
-                "template<class T>\n"    
-                "struct ctor_test_class {\n"
-                // non-trivial ctor
-                "   ctor_test_class(T value) : m_value(value) { };\n"
-                "   T m_value;\n"
-                "};\n\n"
-                // struct   
-                "struct ctor_dtor_test_class {\n"
-                // non-trivial ctor
-                "   ctor_dtor_test_class(uint value) : ctor_test(value) { } \n"
-                // non-trivial dtor
-                // set all values in buffer to m_test_value
-                "   ~ctor_dtor_test_class() {\n"
-                "       for(size_t i = 0; i < this->size; i++)\n"
-                "       {\n"
-                "          this->buffer[i] = ctor_test.m_value;\n"            
-                "       }\n"
-                "   };\n"
-                "   ctor_test_class<uint> ctor_test;\n"
-                "   global_ptr<uint[]> buffer;\n"
-                "   size_t size;\n"
-                "};\n" 
-                // global scope program variable
-                "ctor_dtor_test_class global_var(" + std::to_string(m_test_value) + ");\n"
-
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<uint[]> output)\n"
-                "{\n"
-                "   size_t gid = get_global_id(0);\n"
-                // set buffer and size in global var
-                "   if(gid == 0){\n"
-                "       global_var.buffer = output;\n"
-                "       global_var.size = get_global_size(0);\n"
-                "   }\n"
-                "}\n";
-        #endif
-    }
-
-private:
-    cl_uint m_test_value;
-};
-
-// This contains tests for program scope (global) constructors and destructors, more
-// detailed tests are also in clcpp/api.
-AUTO_TEST_CASE(test_program_scope_ctors_dtors)
-(cl_device_id device, cl_context context, cl_command_queue queue, int count)
-{
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    RUN_PS_CTORS_DTORS_TEST_MACRO(ps_ctor_test(0xdeadbeefU))
-    RUN_PS_CTORS_DTORS_TEST_MACRO(ps_dtor_test(0xbeefdeadU))
-    RUN_PS_CTORS_DTORS_TEST_MACRO(ps_ctor_dtor_test(0xdeaddeadU))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_PS_CTORS_DTORS_TEST_CTORS_DTORS_HPP
diff --git a/test_conformance/clcpp/reinterpret/CMakeLists.txt b/test_conformance/clcpp/reinterpret/CMakeLists.txt
deleted file mode 100644
index ed02c56fba..0000000000
--- a/test_conformance/clcpp/reinterpret/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_REINTERPRET)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/reinterpret/as_type.hpp b/test_conformance/clcpp/reinterpret/as_type.hpp
deleted file mode 100644
index da088cfab6..0000000000
--- a/test_conformance/clcpp/reinterpret/as_type.hpp
+++ /dev/null
@@ -1,223 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_REINTERPRET_AS_TYPE_HPP
-#define TEST_CONFORMANCE_CLCPP_REINTERPRET_AS_TYPE_HPP
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#include <cstring>
-
-
-template<class IN1, class OUT1>
-struct as_type : public unary_func<IN1, OUT1>
-{
-    static_assert(sizeof(IN1) == sizeof(OUT1), "It is an error to use the as_type<T> operator to reinterpret data to a type of a different number of bytes");
-
-    std::string str()
-    {
-        return "as_type";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_reinterpret>\n";
-    }
-
-    OUT1 operator()(const IN1& x)
-    {
-        return *reinterpret_cast<const OUT1*>(&x);
-    }
-};
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <class func_type, class in_type, class out_type>
-std::string generate_kernel_as_type(func_type func)
-{
-    std::string in1_value = "input[gid]";
-    std::string function_call = "as_" + type_name<out_type>() + "(" + in1_value + ");";
-    return
-        "__kernel void test_" + func.str() + "(global " + type_name<in_type>() + " *input, global " + type_name<out_type>() + " *output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    output[gid] = " + function_call + ";\n"
-        "}\n";
-}
-#else
-template <class func_type, class in_type, class out_type>
-std::string generate_kernel_as_type(func_type func)
-{
-    std::string headers = func.headers();
-    std::string in1_value = "input[gid]";
-    std::string function_call = "as_type<" + type_name<out_type>() + ">(" + in1_value + ")";
-    return
-        "" + func.defs() +
-        "" + headers +
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "using namespace cl;\n"
-        "__kernel void test_" + func.str() + "(global_ptr<" + type_name<in_type>() +  "[]> input,"
-                                              "global_ptr<" + type_name<out_type>() + "[]> output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    output[gid] = " + function_call + ";\n"
-        "}\n";
-}
-#endif
-
-template<class INPUT, class OUTPUT, class as_type_op>
-bool verify_as_type(const std::vector<INPUT> &in, const std::vector<OUTPUT> &out, as_type_op op)
-{
-    // When the operand and result type contain a different number of elements, the result is implementation-defined,
-    // i.e. any result is correct
-    if (vector_size<INPUT>::value == vector_size<OUTPUT>::value)
-    {
-        for (size_t i = 0; i < in.size(); i++)
-        {
-            auto expected = op(in[i]);
-            if (std::memcmp(&expected, &out[i], sizeof(expected)) != 0)
-            {
-                print_error_msg(expected, out[i], i, op);
-                return false;
-            }
-        }
-    }
-    return true;
-}
-
-template <class as_type_op>
-int test_as_type_func(cl_device_id device, cl_context context, cl_command_queue queue, size_t count, as_type_op op)
-{
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t work_size[1];
-    int error;
-
-    typedef typename as_type_op::in_type INPUT;
-    typedef typename as_type_op::out_type OUTPUT;
-
-    // Don't run test for unsupported types
-    if (!(type_supported<INPUT>(device) && type_supported<OUTPUT>(device)))
-    {
-        return CL_SUCCESS;
-    }
-
-    std::string code_str = generate_kernel_as_type<as_type_op, INPUT, OUTPUT>(op);
-    std::string kernel_name("test_"); kernel_name += op.str();
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name, "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(error)
-#else
-    error = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(error)
-#endif
-
-    std::vector<INPUT> input = generate_input<INPUT>(count, op.min1(), op.max1(), op.in_special_cases());
-    std::vector<OUTPUT> output = generate_output<OUTPUT>(count);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(INPUT) * input.size(), NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(OUTPUT) * output.size(), NULL, &error);
-    RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-    error = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(INPUT) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueWriteBuffer")
-
-    error = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-    error = clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-    work_size[0] = count;
-    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-    error = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(OUTPUT) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-    if (!verify_as_type(input, output, op))
-    {
-        RETURN_ON_ERROR_MSG(-1, "test_%s %s(%s) failed", op.str().c_str(), type_name<OUTPUT>().c_str(), type_name<INPUT>().c_str());
-    }
-    log_info("test_%s %s(%s) passed\n", op.str().c_str(), type_name<OUTPUT>().c_str(), type_name<INPUT>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-AUTO_TEST_CASE(test_as_type)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-#define TEST_AS_TYPE_MACRO(TYPE1, TYPE2) \
-    last_error = test_as_type_func( \
-        device, context, queue, n_elems, as_type<TYPE1, TYPE2>() \
-    ); \
-    CHECK_ERROR(last_error) \
-    error |= last_error;
-
-    TEST_AS_TYPE_MACRO(cl_int, cl_int)
-    TEST_AS_TYPE_MACRO(cl_uint, cl_int)
-    TEST_AS_TYPE_MACRO(cl_int, cl_ushort2)
-    TEST_AS_TYPE_MACRO(cl_uchar, cl_uchar)
-    TEST_AS_TYPE_MACRO(cl_char4, cl_ushort2)
-    TEST_AS_TYPE_MACRO(cl_uchar16, cl_char16)
-    TEST_AS_TYPE_MACRO(cl_short8, cl_uchar16)
-    TEST_AS_TYPE_MACRO(cl_float4, cl_uint4)
-    TEST_AS_TYPE_MACRO(cl_float16, cl_int16)
-    TEST_AS_TYPE_MACRO(cl_long2, cl_float4)
-    TEST_AS_TYPE_MACRO(cl_ulong, cl_long)
-    TEST_AS_TYPE_MACRO(cl_ulong16, cl_double16)
-    TEST_AS_TYPE_MACRO(cl_uchar16, cl_double2)
-    TEST_AS_TYPE_MACRO(cl_ulong4, cl_short16)
-
-#undef TEST_AS_TYPE_MACRO
-
-    if (error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-
-#endif // TEST_CONFORMANCE_CLCPP_REINTERPRET_AS_TYPE_HPP
diff --git a/test_conformance/clcpp/reinterpret/main.cpp b/test_conformance/clcpp/reinterpret/main.cpp
deleted file mode 100644
index 06d7056f3e..0000000000
--- a/test_conformance/clcpp/reinterpret/main.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "as_type.hpp"
-
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/relational_funcs/CMakeLists.txt b/test_conformance/clcpp/relational_funcs/CMakeLists.txt
deleted file mode 100644
index 3a8389cba9..0000000000
--- a/test_conformance/clcpp/relational_funcs/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_RELATIONAL_FUNCS)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/relational_funcs/common.hpp b/test_conformance/clcpp/relational_funcs/common.hpp
deleted file mode 100644
index a13f7bacd6..0000000000
--- a/test_conformance/clcpp/relational_funcs/common.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_COMMON_HPP
-#define TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_COMMON_HPP
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#include <type_traits>
-#include <cmath>
-
-template<class IN1, class IN2, class IN3, class OUT1, class F>
-OUT1 perform_function(const IN1& in1, const IN2& in2, const IN3& in3, F func, typename std::enable_if<is_vector_type<OUT1>::value>::type* = 0)
-{
-    OUT1 result;
-    for(size_t i = 0; i < vector_size<OUT1>::value; i++)
-    {
-        result.s[i] = func(in1.s[i], in2.s[i], in3.s[i]);
-    }
-    return result;
-}
-
-template<class IN1, class IN2, class IN3, class OUT1, class F>
-OUT1 perform_function(const IN1& in1, const IN2& in2, const IN3& in3, F func, typename std::enable_if<!is_vector_type<OUT1>::value>::type* = 0)
-{
-    OUT1 result = func(in1, in2, in3);
-    return result;
-}
-
-
-template<class IN1, class IN2, class OUT1, class F>
-OUT1 perform_function(const IN1& in1, const IN2& in2, F func, typename std::enable_if<is_vector_type<OUT1>::value>::type* = 0)
-{
-    OUT1 result;
-    for(size_t i = 0; i < vector_size<OUT1>::value; i++)
-    {
-        result.s[i] = func(in1.s[i], in2.s[i]);
-    }
-    return result;
-}
-
-template<class IN1, class IN2, class OUT1, class F>
-OUT1 perform_function(const IN1& in1, const IN2& in2, F func, typename std::enable_if<!is_vector_type<OUT1>::value>::type* = 0)
-{
-    OUT1 result = func(in1, in2);
-    return result;
-}
-
-template<class IN1, class OUT1, class F>
-OUT1 perform_function(const IN1& in1, F func, typename std::enable_if<is_vector_type<OUT1>::value>::type* = 0)
-{
-    OUT1 result;
-    for(size_t i = 0; i < vector_size<OUT1>::value; i++)
-    {
-        result.s[i] = func(in1.s[i]);
-    }
-    return result;
-}
-
-template<class IN1, class OUT1, class F>
-OUT1 perform_function(const IN1& in1, F func, typename std::enable_if<!is_vector_type<OUT1>::value>::type* = 0)
-{
-    OUT1 result = func(in1);
-    return result;
-}
-
-template<class IN1>
-cl_int perform_all_function(const IN1& in1, typename std::enable_if<is_vector_type<IN1>::value>::type* = 0)
-{
-    cl_int result = 1;
-    for(size_t i = 0; i < vector_size<IN1>::value; i++)
-    {
-        result = (in1.s[i] != 0) ? result : cl_int(0);
-    }
-    return result;
-}
-
-cl_int perform_all_function(const cl_int& in1, typename std::enable_if<!is_vector_type<cl_int>::value>::type* = 0)
-{
-    return (in1 != 0) ? cl_int(1) : cl_int(0);
-}
-
-template<class IN1>
-cl_int perform_any_function(const IN1& in1, typename std::enable_if<is_vector_type<IN1>::value>::type* = 0)
-{
-    cl_int result = 0;
-    for(size_t i = 0; i < vector_size<IN1>::value; i++)
-    {
-        result = (in1.s[i] != 0) ? cl_int(1) : result;
-    }
-    return result;
-}
-
-cl_int perform_any_function(const cl_int& in1, typename std::enable_if<!is_vector_type<cl_int>::value>::type* = 0)
-{
-    return (in1 != 0) ? cl_int(1) : cl_int(0);
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_COMMON_HPP
diff --git a/test_conformance/clcpp/relational_funcs/comparison_funcs.hpp b/test_conformance/clcpp/relational_funcs/comparison_funcs.hpp
deleted file mode 100644
index 980d67c843..0000000000
--- a/test_conformance/clcpp/relational_funcs/comparison_funcs.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_COMPARISON_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_COMPARISON_FUNCS_HPP
-
-#include "common.hpp"
-
-// This marco creates a class wrapper for comparision function we want to test.
-#define DEF_COMPARISION_FUNC(CLASS_NAME, FUNC_NAME, HOST_FUNC_EXPRESSION) \
-template <cl_int N /* Vector size */> \
-struct CLASS_NAME : public binary_func< \
-                                    typename make_vector_type<cl_float, N>::type, /* create cl_floatN type */ \
-                                    typename make_vector_type<cl_float, N>::type, /* create cl_floatN type */ \
-                                    typename make_vector_type<cl_int, N>::type /* create cl_intN type */ \
-                                 > \
-{ \
-    typedef typename make_vector_type<cl_float, N>::type input_type; \
-    typedef typename make_vector_type<cl_int, N>::type result_type; \
-    \
-    std::string str() \
-    { \
-        return #FUNC_NAME; \
-    } \
-    \
-    std::string headers() \
-    { \
-        return "#include <opencl_relational>\n"; \
-    } \
-    \
-    result_type operator()(const input_type& x, const input_type& y) \
-    {    \
-        typedef typename scalar_type<input_type>::type SCALAR; \
-        return perform_function<input_type, input_type, result_type>( \
-            x, y, \
-            [](const SCALAR& a, const SCALAR& b) \
-            { \
-                if(HOST_FUNC_EXPRESSION) \
-                { \
-                    return cl_int(1); \
-                } \
-                return cl_int(0); \
-            } \
-        ); \
-    } \
-    \
-    bool is_out_bool() \
-    { \
-        return true; \
-    } \
-    \
-    input_type min1() \
-    { \
-        return detail::def_limit<input_type>(-10000.0f); \
-    } \
-    \
-    input_type max1() \
-    { \
-        return detail::def_limit<input_type>(10000.0f); \
-    } \
-    \
-    input_type min2() \
-    { \
-        return detail::def_limit<input_type>(-10000.0f); \
-    } \
-    \
-    input_type max2() \
-    { \
-        return detail::def_limit<input_type>(10000.0f); \
-    } \
-    \
-    std::vector<input_type> in1_special_cases() \
-    { \
-        typedef typename scalar_type<input_type>::type SCALAR; \
-        return {  \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::infinity()), \
-            detail::make_value<input_type>(-std::numeric_limits<SCALAR>::infinity()), \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::quiet_NaN()), \
-            detail::make_value<input_type>(0.0f), \
-            detail::make_value<input_type>(-0.0f) \
-        }; \
-    } \
-    \
-    std::vector<input_type> in2_special_cases() \
-    { \
-        typedef typename scalar_type<input_type>::type SCALAR; \
-        return {  \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::infinity()), \
-            detail::make_value<input_type>(-std::numeric_limits<SCALAR>::infinity()), \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::quiet_NaN()), \
-            detail::make_value<input_type>(0.0f), \
-            detail::make_value<input_type>(-0.0f) \
-        }; \
-    } \
-};
-
-DEF_COMPARISION_FUNC(comparison_func_isequal, isequal, (a == b))
-DEF_COMPARISION_FUNC(comparison_func_isnotequal, isnotequal, !(a == b))
-DEF_COMPARISION_FUNC(comparison_func_isgreater, isgreater, (std::isgreater)(a, b))
-DEF_COMPARISION_FUNC(comparison_func_isgreaterequal, isgreaterequal, ((std::isgreater)(a, b) || a == b))
-DEF_COMPARISION_FUNC(comparison_func_isless, isless, (std::isless)(a, b))
-DEF_COMPARISION_FUNC(comparison_func_islessequal, islessequal, ((std::isless)(a, b) || a == b))
-DEF_COMPARISION_FUNC(comparison_func_islessgreater, islessgreater, ((a < b) || (a > b)))
-
-#undef DEF_COMPARISION_FUNC
-
-AUTO_TEST_CASE(test_relational_comparison_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-// Helper macro, so we don't have to repreat the same code.  
-#define TEST_BINARY_REL_FUNC_MACRO(CLASS_NAME) \
-    TEST_BINARY_FUNC_MACRO(CLASS_NAME<1>()) \
-    TEST_BINARY_FUNC_MACRO(CLASS_NAME<2>()) \
-    TEST_BINARY_FUNC_MACRO(CLASS_NAME<4>()) \
-    TEST_BINARY_FUNC_MACRO(CLASS_NAME<8>()) \
-    TEST_BINARY_FUNC_MACRO(CLASS_NAME<16>())
-
-    TEST_BINARY_REL_FUNC_MACRO(comparison_func_isequal)
-    TEST_BINARY_REL_FUNC_MACRO(comparison_func_isnotequal)
-    TEST_BINARY_REL_FUNC_MACRO(comparison_func_isgreater)
-    TEST_BINARY_REL_FUNC_MACRO(comparison_func_isgreaterequal)
-    TEST_BINARY_REL_FUNC_MACRO(comparison_func_isless)
-    TEST_BINARY_REL_FUNC_MACRO(comparison_func_islessequal)
-    TEST_BINARY_REL_FUNC_MACRO(comparison_func_islessgreater)
-
-#undef TEST_BINARY_REL_FUNC_MACRO
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_COMPARISON_FUNCS_HPP
diff --git a/test_conformance/clcpp/relational_funcs/main.cpp b/test_conformance/clcpp/relational_funcs/main.cpp
deleted file mode 100644
index 2b72d3d24f..0000000000
--- a/test_conformance/clcpp/relational_funcs/main.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "comparison_funcs.hpp"
-#include "select_funcs.hpp"
-#include "test_funcs.hpp"
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/relational_funcs/select_funcs.hpp b/test_conformance/clcpp/relational_funcs/select_funcs.hpp
deleted file mode 100644
index 2e6f6bdd58..0000000000
--- a/test_conformance/clcpp/relational_funcs/select_funcs.hpp
+++ /dev/null
@@ -1,158 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_SELECT_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_SELECT_FUNCS_HPP
-
-#include "common.hpp"
-
-template <class IN1, cl_int N /* Vector size */>
-struct select_func_select : public ternary_func<
-                                    typename make_vector_type<IN1, N>::type, /* create IN1N type */
-                                    typename make_vector_type<IN1, N>::type, /* create IN1N type */
-                                    typename make_vector_type<cl_int, N>::type, /* create cl_intN type */
-                                    typename make_vector_type<IN1, N>::type /* create IN1N type */
-                                 >
-{
-    typedef typename make_vector_type<IN1, N>::type input1_type;
-    typedef typename make_vector_type<IN1, N>::type input2_type;
-    typedef typename make_vector_type<cl_int, N>::type input3_type;
-    typedef typename make_vector_type<IN1, N>::type result_type;
-   
-    std::string str()
-    {
-        return "select";
-    }
-   
-    std::string headers()
-    {
-        return "#include <opencl_relational>\n";
-    }
-   
-    result_type operator()(const input1_type& x, const input2_type& y, const input3_type& z)
-    {   
-        typedef typename scalar_type<input1_type>::type SCALAR1;
-        typedef typename scalar_type<input2_type>::type SCALAR2;
-        typedef typename scalar_type<input3_type>::type SCALAR3;
-
-        return perform_function<input1_type, input2_type, input3_type, result_type>(
-            x, y, z,
-            [](const SCALAR1& a, const SCALAR2& b, const SCALAR3& c)
-            {
-                    return (c != 0) ? b : a;
-            }
-        );
-    }
-
-    bool is_in3_bool()
-    {
-        return true;
-    }
-   
-    std::vector<input3_type> in3_special_cases()
-    {
-        return { 
-            detail::make_value<input3_type>(0),
-            detail::make_value<input3_type>(1),
-            detail::make_value<input3_type>(12),
-            detail::make_value<input3_type>(-12)
-        };
-    }
-};
-
-template <class IN1, cl_int N /* Vector size */>
-struct select_func_bitselect : public ternary_func<
-                                    typename make_vector_type<IN1, N>::type, /* create IN1N type */
-                                    typename make_vector_type<IN1, N>::type, /* create IN1N type */
-                                    typename make_vector_type<IN1, N>::type, /* create cl_intN type */
-                                    typename make_vector_type<IN1, N>::type /* create IN1N type */
-                                 >
-{
-    typedef typename make_vector_type<IN1, N>::type input1_type;
-    typedef typename make_vector_type<IN1, N>::type input2_type;
-    typedef typename make_vector_type<IN1, N>::type input3_type;
-    typedef typename make_vector_type<IN1, N>::type result_type;
-   
-    std::string str()
-    {
-        return "bitselect";
-    }
-   
-    std::string headers()
-    {
-        return "#include <opencl_relational>\n";
-    }
-   
-    result_type operator()(const input1_type& x, const input2_type& y, const input3_type& z)
-    {  
-        static_assert(
-            std::is_integral<IN1>::value,
-            "bitselect test is implemented only for integers."
-        ); 
-        static_assert(
-            std::is_unsigned<IN1>::value,
-            "IN1 type should be unsigned, bitwise operations on signed int may cause problems."
-        );
-        typedef typename scalar_type<input1_type>::type SCALAR1;
-        typedef typename scalar_type<input2_type>::type SCALAR2;
-        typedef typename scalar_type<input3_type>::type SCALAR3;
-
-        return perform_function<input1_type, input2_type, input3_type, result_type>(
-            x, y, z,
-            [](const SCALAR1& a, const SCALAR2& b, const SCALAR3& c)
-            {
-                return (~c & a) | (c & b);
-            }
-        );
-    }
-};
-
-AUTO_TEST_CASE(test_relational_select_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-// Tests for select(gentype a, gentype b, booln c) are not run in USE_OPENCLC_KERNELS 
-// mode, because this functions in OpenCL C requires different reference functions on host
-// compared to their equivalent in OpenCL C++.
-// (In OpenCL C the result of select(), when gentype is vector type, is based on the most
-// significant bits of c components)
-#ifndef USE_OPENCLC_KERNELS
-    // gentype select(gentype a, gentype b, booln c)
-    TEST_TERNARY_FUNC_MACRO((select_func_select<cl_uint,  1>()))
-    TEST_TERNARY_FUNC_MACRO((select_func_select<cl_float, 2>()))
-    TEST_TERNARY_FUNC_MACRO((select_func_select<cl_short, 4>()))
-    TEST_TERNARY_FUNC_MACRO((select_func_select<cl_uint,  8>()))
-    TEST_TERNARY_FUNC_MACRO((select_func_select<cl_uint,  16>()))
-#else
-    log_info("WARNING:\n\tTests for select(gentype a, gentype b, booln c) are not run in USE_OPENCLC_KERNELS mode\n");
-#endif
-
-    // gentype bitselect(gentype a, gentype b, gentype c)
-    TEST_TERNARY_FUNC_MACRO((select_func_bitselect<cl_uint, 1>()))
-    TEST_TERNARY_FUNC_MACRO((select_func_bitselect<cl_ushort, 2>()))
-    TEST_TERNARY_FUNC_MACRO((select_func_bitselect<cl_uchar, 4>()))
-    TEST_TERNARY_FUNC_MACRO((select_func_bitselect<cl_ushort, 8>()))
-    TEST_TERNARY_FUNC_MACRO((select_func_bitselect<cl_uint, 16>()))
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_SELECT_FUNCS_HPP
diff --git a/test_conformance/clcpp/relational_funcs/test_funcs.hpp b/test_conformance/clcpp/relational_funcs/test_funcs.hpp
deleted file mode 100644
index 77e3d871c1..0000000000
--- a/test_conformance/clcpp/relational_funcs/test_funcs.hpp
+++ /dev/null
@@ -1,336 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_TEST_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_TEST_FUNCS_HPP
-
-#include "common.hpp"
-
-// This marco creates a class wrapper for unary test function we want to test.
-#define DEF_UNARY_TEST_FUNC(CLASS_NAME, FUNC_NAME, HOST_FUNC_EXPRESSION) \
-template <cl_int N /* Vector size */> \
-struct CLASS_NAME : public unary_func< \
-                                    typename make_vector_type<cl_float, N>::type, /* create cl_floatN type */ \
-                                    typename make_vector_type<cl_int, N>::type /* create cl_intN type */ \
-                                 > \
-{ \
-    typedef typename make_vector_type<cl_float, N>::type input_type; \
-    typedef typename make_vector_type<cl_int, N>::type result_type; \
-    \
-    std::string str() \
-    { \
-        return #FUNC_NAME; \
-    } \
-    \
-    std::string headers() \
-    { \
-        return "#include <opencl_relational>\n"; \
-    } \
-    \
-    result_type operator()(const input_type& x) \
-    {    \
-        typedef typename scalar_type<input_type>::type SCALAR; \
-        return perform_function<input_type, result_type>( \
-            x, \
-            [](const SCALAR& a) \
-            { \
-                if(HOST_FUNC_EXPRESSION) \
-                { \
-                    return cl_int(1); \
-                } \
-                return cl_int(0); \
-            } \
-        ); \
-    } \
-    \
-    bool is_out_bool() \
-    { \
-        return true; \
-    } \
-    \
-    input_type min1() \
-    { \
-        return detail::def_limit<input_type>(-10000.0f); \
-    } \
-    \
-    input_type max1() \
-    { \
-        return detail::def_limit<input_type>(10000.0f); \
-    } \
-    \
-    std::vector<input_type> in1_special_cases() \
-    { \
-        typedef typename scalar_type<input_type>::type SCALAR; \
-        return {  \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::infinity()), \
-            detail::make_value<input_type>(-std::numeric_limits<SCALAR>::infinity()), \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::quiet_NaN()), \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::signaling_NaN()), \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::denorm_min()), \
-            detail::make_value<input_type>(0.0f), \
-            detail::make_value<input_type>(-0.0f) \
-        }; \
-    } \
-};
-
-// This marco creates a class wrapper for binary test function we want to test.
-#define DEF_BINARY_TEST_FUNC(CLASS_NAME, FUNC_NAME, HOST_FUNC_EXPRESSION) \
-template <cl_int N /* Vector size */> \
-struct CLASS_NAME : public binary_func< \
-                                    typename make_vector_type<cl_float, N>::type, /* create cl_floatN type */ \
-                                    typename make_vector_type<cl_float, N>::type, /* create cl_floatN type */ \
-                                    typename make_vector_type<cl_int, N>::type /* create cl_intN type */ \
-                                 > \
-{ \
-    typedef typename make_vector_type<cl_float, N>::type input_type; \
-    typedef typename make_vector_type<cl_int, N>::type result_type; \
-    \
-    std::string str() \
-    { \
-        return #FUNC_NAME; \
-    } \
-    \
-    std::string headers() \
-    { \
-        return "#include <opencl_relational>\n"; \
-    } \
-    \
-    result_type operator()(const input_type& x, const input_type& y) \
-    {    \
-        typedef typename scalar_type<input_type>::type SCALAR; \
-        return perform_function<input_type, input_type, result_type>( \
-            x, y, \
-            [](const SCALAR& a, const SCALAR& b) \
-            { \
-                if(HOST_FUNC_EXPRESSION) \
-                { \
-                    return cl_int(1); \
-                } \
-                return cl_int(0); \
-            } \
-        ); \
-    } \
-    \
-    bool is_out_bool() \
-    { \
-        return true; \
-    } \
-    \
-    input_type min1() \
-    { \
-        return detail::def_limit<input_type>(-10000.0f); \
-    } \
-    \
-    input_type max1() \
-    { \
-        return detail::def_limit<input_type>(10000.0f); \
-    } \
-    \
-    input_type min2() \
-    { \
-        return detail::def_limit<input_type>(-10000.0f); \
-    } \
-    \
-    input_type max2() \
-    { \
-        return detail::def_limit<input_type>(10000.0f); \
-    } \
-    \
-    std::vector<input_type> in1_special_cases() \
-    { \
-        typedef typename scalar_type<input_type>::type SCALAR; \
-        return {  \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::infinity()), \
-            detail::make_value<input_type>(-std::numeric_limits<SCALAR>::infinity()), \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::quiet_NaN()), \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::signaling_NaN()), \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::denorm_min()), \
-            detail::make_value<input_type>(0.0f), \
-            detail::make_value<input_type>(-0.0f) \
-        }; \
-    } \
-    \
-    std::vector<input_type> in2_special_cases() \
-    { \
-        typedef typename scalar_type<input_type>::type SCALAR; \
-        return {  \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::infinity()), \
-            detail::make_value<input_type>(-std::numeric_limits<SCALAR>::infinity()), \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::quiet_NaN()), \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::signaling_NaN()), \
-            detail::make_value<input_type>(std::numeric_limits<SCALAR>::denorm_min()), \
-            detail::make_value<input_type>(0.0f), \
-            detail::make_value<input_type>(-0.0f) \
-        }; \
-    } \
-};
-
-DEF_UNARY_TEST_FUNC(test_func_isfinite, isfinite, (std::isfinite)(a))
-DEF_UNARY_TEST_FUNC(test_func_isinf, isinf, (std::isinf)(a))
-DEF_UNARY_TEST_FUNC(test_func_isnan, isnan, (std::isnan)(a))
-DEF_UNARY_TEST_FUNC(test_func_isnormal, isnormal, (std::isnormal)(a))
-DEF_UNARY_TEST_FUNC(test_func_signbit, signbit , (std::signbit)(a))
-
-DEF_BINARY_TEST_FUNC(test_func_isordered, isordered, !(std::isunordered)(a, b))
-DEF_BINARY_TEST_FUNC(test_func_isunordered, isunordered, (std::isunordered)(a, b))
-
-#undef DEF_UNARY_TEST_FUNC
-#undef DEF_BINARY_TEST_FUNC
-
-template <cl_int N /* Vector size */>
-struct test_func_all : public unary_func<
-                                    typename make_vector_type<cl_int, N>::type, /* create cl_intN type */
-                                    cl_int /* create cl_intN type */
-                                 >
-{
-    typedef typename make_vector_type<cl_int, N>::type input_type;
-    typedef cl_int result_type;
-
-    std::string str()
-    {
-        return "all";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_relational>\n";
-    }
-
-    result_type operator()(const input_type& x)
-    {
-        return perform_all_function(x);
-    }
-
-    bool is_out_bool()
-    {
-        return true;
-    }
-
-    bool is_in1_bool()
-    {
-        return true;
-    }
-
-    std::vector<input_type> in1_special_cases()
-    {
-        return {
-            detail::make_value<input_type>(0),
-            detail::make_value<input_type>(1),
-            detail::make_value<input_type>(12),
-            detail::make_value<input_type>(-12)
-        };
-    }
-};
-
-template <cl_int N /* Vector size */>
-struct test_func_any : public unary_func<
-                                    typename make_vector_type<cl_int, N>::type, /* create cl_intN type */
-                                    cl_int /* create cl_intN type */
-                                 >
-{
-    typedef typename make_vector_type<cl_int, N>::type input_type;
-    typedef cl_int result_type;
-
-    std::string str()
-    {
-        return "any";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_relational>\n";
-    }
-
-    result_type operator()(const input_type& x)
-    {
-        return perform_any_function(x);
-    }
-
-    bool is_out_bool()
-    {
-        return true;
-    }
-
-    bool is_in1_bool()
-    {
-        return true;
-    }
-
-    std::vector<input_type> in1_special_cases()
-    {
-        return {
-            detail::make_value<input_type>(0),
-            detail::make_value<input_type>(1),
-            detail::make_value<input_type>(12),
-            detail::make_value<input_type>(-12)
-        };
-    }
-};
-
-AUTO_TEST_CASE(test_relational_test_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-// Helper macro, so we don't have to repreat the same code.
-#define TEST_UNARY_REL_FUNC_MACRO(CLASS_NAME) \
-    TEST_UNARY_FUNC_MACRO(CLASS_NAME<1>()) \
-    TEST_UNARY_FUNC_MACRO(CLASS_NAME<2>()) \
-    TEST_UNARY_FUNC_MACRO(CLASS_NAME<4>()) \
-    TEST_UNARY_FUNC_MACRO(CLASS_NAME<8>()) \
-    TEST_UNARY_FUNC_MACRO(CLASS_NAME<16>())
-
-    TEST_UNARY_REL_FUNC_MACRO(test_func_isfinite)
-    TEST_UNARY_REL_FUNC_MACRO(test_func_isinf)
-    TEST_UNARY_REL_FUNC_MACRO(test_func_isnan)
-    TEST_UNARY_REL_FUNC_MACRO(test_func_isnormal)
-    TEST_UNARY_REL_FUNC_MACRO(test_func_signbit)
-
-// Tests for all(booln x) and any(booln x) are not run in USE_OPENCLC_KERNELS mode,
-// because those functions in OpenCL C require different reference functions on host
-// compared to their equivalents from OpenCL C++.
-// (In OpenCL C those functions returns true/false based on the most significant bits
-// in any/all component/s of x)
-#ifndef USE_OPENCLC_KERNELS
-    TEST_UNARY_REL_FUNC_MACRO(test_func_all)
-    TEST_UNARY_REL_FUNC_MACRO(test_func_any)
-#else
-    log_info("WARNING:\n\tTests for bool all(booln x) are not run in USE_OPENCLC_KERNELS mode\n");
-    log_info("WARNING:\n\tTests for bool any(booln x) are not run in USE_OPENCLC_KERNELS mode\n");
-#endif
-
-#undef TEST_UNARY_REL_FUNC_MACRO
-
-#define TEST_BINARY_REL_FUNC_MACRO(CLASS_NAME) \
-    TEST_BINARY_FUNC_MACRO(CLASS_NAME<1>()) \
-    TEST_BINARY_FUNC_MACRO(CLASS_NAME<2>()) \
-    TEST_BINARY_FUNC_MACRO(CLASS_NAME<4>()) \
-    TEST_BINARY_FUNC_MACRO(CLASS_NAME<8>()) \
-    TEST_BINARY_FUNC_MACRO(CLASS_NAME<16>())
-
-    TEST_BINARY_REL_FUNC_MACRO(test_func_isordered)
-    TEST_BINARY_REL_FUNC_MACRO(test_func_isunordered)
-
-#undef TEST_BINARY_REL_FUNC_MACRO
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_TEST_FUNCS_HPP
diff --git a/test_conformance/clcpp/spec_constants/CMakeLists.txt b/test_conformance/clcpp/spec_constants/CMakeLists.txt
deleted file mode 100644
index 3488a5a783..0000000000
--- a/test_conformance/clcpp/spec_constants/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_SPEC_CONSTANTS)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/spec_constants/common.hpp b/test_conformance/clcpp/spec_constants/common.hpp
deleted file mode 100644
index 17b31aebae..0000000000
--- a/test_conformance/clcpp/spec_constants/common.hpp
+++ /dev/null
@@ -1,256 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SPEC_CONSTANTS_COMMON_HPP
-#define TEST_CONFORMANCE_CLCPP_SPEC_CONSTANTS_COMMON_HPP
-
-#include <algorithm>
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#define RUN_SPEC_CONSTANTS_TEST_MACRO(TEST_CLASS) \
-    last_error = run_spec_constants_test(  \
-        device, context, queue, n_elems, TEST_CLASS \
-    );  \
-    CHECK_ERROR(last_error) \
-    error |= last_error;
-
-// Base class for all tests of cl::spec_contatnt
-template <class T>
-struct spec_constants_test : public detail::base_func_type<T>
-{
-    // Output buffer type
-    typedef T type;
-
-    virtual ~spec_constants_test() {};
-    // Returns test name
-    virtual std::string str() = 0;
-    // Returns OpenCL program source
-    virtual std::string generate_program() = 0;
-
-    // Return names of test's kernels, in order.
-    // Typical case: one kernel.
-    virtual std::vector<std::string> get_kernel_names()
-    {
-        // Typical case, that is, only one kernel
-        return { this->get_kernel_name() };
-    }
-
-    // If local size has to be set in clEnqueueNDRangeKernel()
-    // this should return true; otherwise - false;
-    virtual bool set_local_size()
-    {
-        return false;
-    }
-
-    // Calculates maximal work-group size (one dim)
-    virtual size_t get_max_local_size(const std::vector<cl_kernel>& kernels,
-                                      cl_device_id device,
-                                      size_t work_group_size, // default work-group size
-                                      cl_int& error)
-    {
-        size_t wg_size = work_group_size;
-        for(auto& k : kernels)
-        {
-            size_t max_wg_size;
-            error = clGetKernelWorkGroupInfo(
-                k, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL
-            );
-            RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
-            wg_size = (std::min)(wg_size, max_wg_size);
-        }
-        return wg_size;
-    }
-
-    // Sets spec constants
-    // Typical case: no spec constants to set
-    virtual cl_int set_spec_constants(const cl_program& program)
-    {
-        return CL_SUCCESS;
-    }
-
-    // This covers typical case:
-    // 1. each kernel is executed once,
-    // 2. the only argument in every kernel is output_buffer
-    virtual cl_int execute(const std::vector<cl_kernel>& kernels,
-                           cl_mem& output_buffer,
-                           cl_command_queue& queue,
-                           size_t work_size,
-                           size_t work_group_size)
-    {
-        cl_int err;
-        for(auto& k : kernels)
-        {
-            err = clSetKernelArg(k, 0, sizeof(output_buffer), &output_buffer);
-            RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-            err = clEnqueueNDRangeKernel(
-                queue, k, 1,
-                NULL, &work_size, this->set_local_size() ? &work_group_size : NULL,
-                0, NULL, NULL
-            );
-            RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-        }
-        return err;
-    }
-
-    // This is a function which performs additional queries and checks
-    // if the results are correct. This method is run after checking that
-    // test results (output values) are correct.
-    virtual cl_int check_queries(const std::vector<cl_kernel>& kernels,
-                                 cl_device_id device,
-                                 cl_context context,
-                                 cl_command_queue queue)
-    {
-        (void) kernels;
-        (void) device;
-        (void) context;
-        (void) queue;
-        return CL_SUCCESS;
-    }
-};
-
-template <class spec_constants_test>
-int run_spec_constants_test(cl_device_id device, cl_context context, cl_command_queue queue, size_t count, spec_constants_test op)
-{
-    cl_mem buffers[1];
-    cl_program program;
-    std::vector<cl_kernel> kernels;
-    size_t wg_size;
-    size_t work_size[1];
-    cl_int err;
-
-    typedef typename spec_constants_test::type TYPE;
-
-    // Don't run test for unsupported types
-    if(!(type_supported<TYPE>(device)))
-    {
-        return CL_SUCCESS;
-    }
-
-    std::string code_str = op.generate_program();
-    std::vector<std::string> kernel_names = op.get_kernel_names();
-    if(kernel_names.empty())
-    {
-        RETURN_ON_ERROR_MSG(-1, "No kernel to run");
-    }
-    kernels.resize(kernel_names.size());
-
-    std::string options = "";
-    if(is_extension_available(device, "cl_khr_fp16"))
-    {
-        options += " -cl-fp16-enable";
-    }
-    if(is_extension_available(device, "cl_khr_fp64"))
-    {
-        options += " -cl-fp64-enable";
-    }
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &(kernels[0]), code_str, kernel_names[0], options);
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &(kernels[0]), code_str, kernel_names[0], "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-    for(size_t i = 1; i < kernels.size(); i++)
-    {
-        kernels[i] = clCreateKernel(program, kernel_names[i].c_str(), &err);
-        RETURN_ON_CL_ERROR(err, "clCreateKernel");
-    }
-#else
-    const char * code_c_str = code_str.c_str();
-    err = create_openclcpp_program(context, &program, 1, &(code_c_str), options.c_str());
-    RETURN_ON_ERROR_MSG(err, "Creating OpenCL C++ program failed")
-
-    // Set spec constants
-    err = op.set_spec_constants(program);
-    RETURN_ON_ERROR_MSG(err, "Setting Spec Constants failed")
-
-    // Build program and create 1st kernel
-    err = build_program_create_kernel_helper(
-        context, &program, &(kernels[0]), 1, &(code_c_str), kernel_names[0].c_str()
-    );
-    RETURN_ON_ERROR_MSG(err, "Unable to build program or to create kernel")
-    // Create other kernels
-    for(size_t i = 1; i < kernels.size(); i++)
-    {
-        kernels[i] = clCreateKernel(program, kernel_names[i].c_str(), &err);
-        RETURN_ON_CL_ERROR(err, "clCreateKernel");
-    }
-#endif
-
-    // Find the max possible wg size for among all the kernels
-    wg_size = op.get_max_local_size(kernels, device, 1024, err);
-    RETURN_ON_ERROR(err);
-
-    work_size[0] = count;
-    if(op.set_local_size())
-    {
-        size_t wg_number = static_cast<size_t>(
-            std::ceil(static_cast<double>(count) / wg_size)
-        );
-        work_size[0] = wg_number * wg_size;
-    }
-
-    // host output vector
-    std::vector<TYPE> output = generate_output<TYPE>(work_size[0], 9999);
-
-    // device output buffer
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(TYPE) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    // Execute test
-    err = op.execute(kernels, buffers[0], queue, work_size[0], wg_size);
-    RETURN_ON_ERROR(err)
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(TYPE) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    // Check output values
-    for(size_t i = 0; i < output.size(); i++)
-    {
-        TYPE v = op(i, wg_size);
-        if(!(are_equal(v, output[i], detail::make_value<TYPE>(0), op)))
-        {
-            RETURN_ON_ERROR_MSG(-1,
-                "test_%s(%s) failed. Expected: %s, got: %s", op.str().c_str(), type_name<cl_uint>().c_str(),
-                format_value(v).c_str(), format_value(output[i]).c_str()
-            );
-        }
-    }
-
-    // Check if queries returns correct values
-    err = op.check_queries(kernels, device, context, queue);
-    RETURN_ON_ERROR(err);
-
-    log_info("test_%s(%s) passed\n", op.str().c_str(), type_name<TYPE>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    for(auto& k : kernels)
-        clReleaseKernel(k);
-    clReleaseProgram(program);
-    return err;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_SPEC_CONSTANTS_COMMON_HPP
diff --git a/test_conformance/clcpp/spec_constants/main.cpp b/test_conformance/clcpp/spec_constants/main.cpp
deleted file mode 100644
index 305eb7dc8c..0000000000
--- a/test_conformance/clcpp/spec_constants/main.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "test_spec_consts_attributes.hpp"
-#include "test_spec_consts_if.hpp"
-#include "test_spec_consts_init_vars.hpp"
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/spec_constants/test_spec_consts_attributes.hpp b/test_conformance/clcpp/spec_constants/test_spec_consts_attributes.hpp
deleted file mode 100644
index 539167fd7e..0000000000
--- a/test_conformance/clcpp/spec_constants/test_spec_consts_attributes.hpp
+++ /dev/null
@@ -1,281 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SPEC_CONSTANTS_TEST_SPEC_CONSTS_ATTRIBUTES_HPP
-#define TEST_CONFORMANCE_CLCPP_SPEC_CONSTANTS_TEST_SPEC_CONSTS_ATTRIBUTES_HPP
-
-#include <type_traits>
-
-#include "common.hpp"
-
-// In this test we check if specialization constant can be successfully used
-// in kernel attribute cl::required_work_group_size(X, Y, Z).
-struct spec_const_required_work_group_size_test : public spec_constants_test<cl_uint>
-{
-    // See generate_program() to know what set_spec_constant is for.
-    spec_const_required_work_group_size_test(const bool set_spec_constant,
-                                             const cl_uint work_group_size_0)
-        : m_set_spec_constant(set_spec_constant),
-          m_work_group_size_0(work_group_size_0)
-    {
-
-    }
-
-    std::string str()
-    {
-        if(m_set_spec_constant)
-            return "spec_const_in_required_work_group_size_" + std::to_string(m_work_group_size_0);
-        else
-            return "spec_const_in_required_work_group_size_not_set";
-    }
-
-    bool set_local_size()
-    {
-        return true;
-    }
-
-    size_t get_max_local_size(const std::vector<cl_kernel>& kernels,
-                              cl_device_id device,
-                              size_t work_group_size, // default work-group size
-                              cl_int& error)
-    {
-        if(m_set_spec_constant)
-        {
-            return m_work_group_size_0;
-        }
-        return size_t(1);
-    }
-
-    cl_uint operator()(size_t i, size_t work_group_size)
-    {
-        (void) work_group_size;
-        if(m_set_spec_constant)
-        {
-            return m_work_group_size_0;
-        }
-        return cl_uint(1);
-    }
-
-    // Check if query for CL_KERNEL_COMPILE_WORK_GROUP_SIZE using clGetKernelWorkGroupInfo
-    // return correct values. It should return the work-group size specified by the
-    // cl::required_work_group_size(X, Y, Z) qualifier.
-    cl_int check_queries(const std::vector<cl_kernel>& kernels,
-                         cl_device_id device,
-                         cl_context context,
-                         cl_command_queue queue)
-    {
-        (void) device;
-        (void) context;
-        size_t compile_wg_size[] = { 1, 1, 1 };
-        cl_int error = clGetKernelWorkGroupInfo(
-            kernels[0], device, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
-            sizeof(compile_wg_size), compile_wg_size, NULL
-        );
-        RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
-        if(m_set_spec_constant)
-        {
-            if(compile_wg_size[0] != m_work_group_size_0
-               || compile_wg_size[1] != 1
-               || compile_wg_size[2] != 1)
-            {
-                error = -1;
-            }
-        }
-        else
-        {
-            if(compile_wg_size[0] != 1
-               || compile_wg_size[1] != 1
-               || compile_wg_size[2] != 1)
-            {
-                error = -1;
-            }
-        }
-        return error;
-    }
-
-    // Sets spec constant
-    cl_int set_spec_constants(const cl_program& program)
-    {
-        cl_int error = CL_SUCCESS;
-        if(m_set_spec_constant)
-        {
-            error = clSetProgramSpecializationConstant(
-                program, cl_uint(1), sizeof(cl_uint), static_cast<void*>(&m_work_group_size_0)
-            );
-            RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-        }
-        return error;
-    }
-
-    // Each work-item writes get_local_size(0) to output[work-item-global-id]
-    std::string generate_program(bool with_attribute)
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-            std::string att = " ";
-            if(with_attribute)
-            {
-                std::string work_group_size_0 = "1";
-                if(m_set_spec_constant)
-                {
-                    work_group_size_0 = std::to_string(m_work_group_size_0);
-                }
-                att = "\n__attribute__((reqd_work_group_size(" + work_group_size_0 + ",1,1)))\n";
-            }
-            return
-                "__kernel" + att + "void " + this->get_kernel_name() + "(global uint *output)\n"
-                "{\n"
-                "    uint gid = get_global_id(0);\n"
-                "    output[gid] = get_local_size(0);\n"
-                "}\n";
-
-        #else
-            std::string att = "";
-            if(with_attribute)
-            {
-                att = "[[cl::required_work_group_size(spec1, 1, 1)]]\n";
-            }
-            return
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_spec_constant>\n"
-                "using namespace cl;\n"
-                "spec_constant<uint, 1> spec1{1};\n"
-                + att +
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<uint[]> output)\n"
-                "{\n"
-                "    uint gid = get_global_id(0);\n"
-                "    output[gid] = get_local_size(0);\n"
-                "}\n";
-        #endif
-    }
-
-    // Each work-item writes get_local_size(0) to output[work-item-global-id]
-    std::string generate_program()
-    {
-        return generate_program(true);
-    }
-
-private:
-    bool m_set_spec_constant;
-    cl_uint m_work_group_size_0;
-};
-
-// This function return max work-group size that can be used
-// for kernels defined in source
-size_t get_max_wg_size(const std::string& source,
-                       const std::vector<std::string>& kernel_names,
-                       size_t work_group_size, // max wg size we want to have
-                       cl_device_id device,
-                       cl_context context,
-                       cl_command_queue queue,
-                       cl_int& err)
-{
-    cl_program program;
-    std::vector<cl_kernel> kernels;
-    if(kernel_names.empty())
-    {
-        RETURN_ON_ERROR_MSG(-1, "No kernel to run");
-    }
-    kernels.resize(kernel_names.size());
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &(kernels[0]), source, kernel_names[0], "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-    for(size_t i = 1; i < kernels.size(); i++)
-    {
-        kernels[i] = clCreateKernel(program, kernel_names[i].c_str(), &err);
-        RETURN_ON_CL_ERROR(err, "clCreateKernel");
-    }
-#else
-    err = create_opencl_kernel(context, &program, &(kernels[0]), source, kernel_names[0]);
-    RETURN_ON_ERROR(err)
-    for(size_t i = 1; i < kernels.size(); i++)
-    {
-        kernels[i] = clCreateKernel(program, kernel_names[i].c_str(), &err);
-        RETURN_ON_CL_ERROR(err, "clCreateKernel");
-    }
-#endif
-    size_t wg_size = work_group_size;
-    for(auto& k : kernels)
-    {
-        size_t max_wg_size;
-        err = clGetKernelWorkGroupInfo(
-            k, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL
-        );
-        RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")
-        wg_size = (std::min)(wg_size, max_wg_size);
-    }
-    return wg_size;
-}
-
-AUTO_TEST_CASE(test_spec_constants_in_kernel_attributes)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// If ONLY_SPIRV_COMPILATION is defined we can't check the max work-group size for the
-// kernel because OpenCL kernel object is never created in that mode.
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    const size_t max_wg_size = 16;
-#else
-    // Get max work-group size that can be used in [[cl::required_work_group_size(X, 1, 1)]]
-    // We do this by building kernel without this attribute and checking what is the max
-    // work-group size we can use with it.
-    auto test = spec_const_required_work_group_size_test(true, 1);
-    const size_t max_wg_size = get_max_wg_size(
-        test.generate_program(false), test.get_kernel_names(),
-        1024, // max wg size we want to test
-        device, context, queue,
-        error
-    );
-    RETURN_ON_ERROR_MSG(error, "Can't get max work-group size");
-#endif
-
-    // Run tests when specialization constant spec1 is set (kernel
-    // attribute is [[cl::required_work_group_size(spec1, 1, 1)]]).
-    for(size_t i = 1; i <= max_wg_size; i *=2)
-    {
-        RUN_SPEC_CONSTANTS_TEST_MACRO(
-            spec_const_required_work_group_size_test(
-                true, i
-            )
-        );
-    }
-    // This test does not set spec constant
-    RUN_SPEC_CONSTANTS_TEST_MACRO(
-        spec_const_required_work_group_size_test(
-            false, 9999999 // This value is incorrect, but won't be set and kernel
-                           // attribute should be [[cl::required_work_group_size(1, 1, 1)]]
-        )
-    );
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_SPEC_CONSTANTS_TEST_SPEC_CONSTS_ATTRIBUTES_HPP
diff --git a/test_conformance/clcpp/spec_constants/test_spec_consts_if.hpp b/test_conformance/clcpp/spec_constants/test_spec_consts_if.hpp
deleted file mode 100644
index 1c7cec2abb..0000000000
--- a/test_conformance/clcpp/spec_constants/test_spec_consts_if.hpp
+++ /dev/null
@@ -1,161 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SPEC_CONSTANTS_TEST_SPEC_CONSTS_IF_HPP
-#define TEST_CONFORMANCE_CLCPP_SPEC_CONSTANTS_TEST_SPEC_CONSTS_IF_HPP
-
-#include <type_traits>
-
-#include "common.hpp"
-
-// This class tests using specialization constant in if statement
-template <class T /* spec constant type*/>
-struct spec_const_in_if_test : public spec_constants_test<cl_uint>
-{
-    // See generate_program() to know what set_spec_constant is for.
-    spec_const_in_if_test(const bool set_spec_constant)
-        : m_set_spec_constant(set_spec_constant)
-    {
-        static_assert(
-            is_vector_type<T>::value == false,
-            "Specialization constant can be only scalar int or float type"
-        );
-        switch (sizeof(T))
-        {
-            case 1:
-                m_test_value = T(127);
-                break;
-            case 2:
-                m_test_value = T(0xdeadU);
-                break;
-            // 4 and 8
-            default:
-                m_test_value = T(0xdeaddeadU);
-                break;
-        }
-    }
-
-    std::string str()
-    {
-        return "spec_const_in_if_" + type_name<T>();
-    }
-
-    cl_uint operator()(size_t i, size_t work_group_size)
-    {
-        (void) work_group_size;
-        if(m_set_spec_constant)
-        {
-            return m_test_value;
-        }
-        return static_cast<cl_uint>(i);
-    }
-
-    // Sets spec constant
-    cl_int set_spec_constants(const cl_program& program)
-    {
-        cl_int error = CL_SUCCESS;
-        if(m_set_spec_constant)
-        {
-            T spec1 = static_cast<T>(m_test_value);
-            error = clSetProgramSpecializationConstant(
-                program, cl_uint(1), sizeof(T), static_cast<void*>(&spec1)
-            );
-            RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-        }
-        return error;
-    }
-
-    // IF set_spec_constant == true:
-    // each work-item writes T(m_test_value) to output[work-item-global-id]
-    // Otherwise:
-    // each work-item writes T(get_global_id(0)) to output[work-item-global-id]
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) 
-            std::string result = "gid";
-            if(m_set_spec_constant)
-                result = std::to_string(m_test_value);
-            return 
-                "__kernel void " + this->get_kernel_name() + "(global uint *output)\n"
-                "{\n"
-                "    uint gid = get_global_id(0);\n"
-                "    output[gid] = " + result + ";\n"
-                "}\n";
-
-        #else
-            return         
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_spec_constant>\n"
-                "using namespace cl;\n"
-                "typedef " + type_name<T>() + " TYPE;\n"
-                "spec_constant<TYPE,  1> spec1{TYPE(0)};\n"
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<uint[]> output)\n"
-                "{\n"
-                "    uint gid = get_global_id(0);\n"
-                "    if(get(spec1) == TYPE(" + std::to_string(m_test_value) +"))\n"
-                "    {\n"
-                "        output[gid] = " + std::to_string(m_test_value) +";\n"
-                "    }\n"
-                "    else\n"
-                "    {\n"
-                "        output[gid] = gid;\n"
-                "    }\n"
-                "}\n";        
-        #endif
-    }
-
-private:
-    bool m_set_spec_constant;
-    cl_uint m_test_value;
-};
-
-AUTO_TEST_CASE(test_spec_constants_in_if_statement)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    const std::vector<bool> set_spec_const_options { true, false };
-    for(auto option : set_spec_const_options)
-    {        
-        RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_in_if_test<cl_char>(option));
-        RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_in_if_test<cl_uchar>(option));
-        RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_in_if_test<cl_int>(option));
-        RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_in_if_test<cl_uint>(option));
-        RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_in_if_test<cl_long>(option));
-        RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_in_if_test<cl_ulong>(option));
-        RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_in_if_test<cl_float>(option));
-        if(is_extension_available(device, "cl_khr_fp16"))
-        {
-            RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_in_if_test<cl_half>(option));
-        }
-        if(is_extension_available(device, "cl_khr_fp64"))
-        {
-            RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_in_if_test<cl_double>(option));
-        }
-    }
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_SPEC_CONSTANTS_TEST_SPEC_CONSTS_IF_HPP
diff --git a/test_conformance/clcpp/spec_constants/test_spec_consts_init_vars.hpp b/test_conformance/clcpp/spec_constants/test_spec_consts_init_vars.hpp
deleted file mode 100644
index 20bbff06bc..0000000000
--- a/test_conformance/clcpp/spec_constants/test_spec_consts_init_vars.hpp
+++ /dev/null
@@ -1,174 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SPEC_CONSTANTS_TEST_SPEC_CONSTS_INIT_VARS_HPP
-#define TEST_CONFORMANCE_CLCPP_SPEC_CONSTANTS_TEST_SPEC_CONSTS_INIT_VARS_HPP
-
-#include <type_traits>
-
-#include "common.hpp"
-
-// This class tests initializing variables with a specialization constant value.
-template <class T /* spec constant type*/>
-struct spec_const_init_var : public spec_constants_test<cl_uint>
-{
-    // See generate_program() to know what set_spec_constant is for.
-    spec_const_init_var(const bool set_spec_constant)
-        : m_set_spec_constant(set_spec_constant)
-    {
-        static_assert(
-            is_vector_type<T>::value == false,
-            "Specialization constant can be only scalar int or float type"
-        );
-        switch (sizeof(T))
-        {
-            case 1:
-                m_test_value = T(127);
-                break;
-            case 2:
-                m_test_value = T(0xdeadU);
-                break;
-            // 4 and 8
-            default:
-                m_test_value = T(0xdeaddeadU);
-                break;
-        }
-    }
-
-    std::string str()
-    {
-        return "spec_const_init_var_" + type_name<T>();
-    }
-
-    cl_uint operator()(size_t i, size_t work_group_size)
-    {
-        (void) work_group_size;
-        if(m_set_spec_constant)
-        {
-            return m_test_value;
-        }
-        return static_cast<cl_uint>(i);
-    }
-
-    // Sets spec constant
-    cl_int set_spec_constants(const cl_program& program)
-    {
-        cl_int error = CL_SUCCESS;
-        if(m_set_spec_constant)
-        {
-            T spec = static_cast<T>(m_test_value);
-            // spec1
-            error = clSetProgramSpecializationConstant(
-                program, cl_uint(1), sizeof(T), static_cast<void*>(&spec)
-            );
-            RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-
-            // spec2
-            error = clSetProgramSpecializationConstant(
-                program, cl_uint(2), sizeof(T), static_cast<void*>(&spec)
-            );
-            RETURN_ON_CL_ERROR(error, "clSetProgramSpecializationConstant")
-        }
-        return error;
-    }
-
-    // IF set_spec_constant == true:
-    // each work-item writes T(m_test_value) to output[work-item-global-id]
-    // Otherwise:
-    // each work-item writes T(get_global_id(0)) to output[work-item-global-id]
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) 
-            std::string result = "gid";
-            if(m_set_spec_constant)
-                result = std::to_string(m_test_value);
-            return 
-                "__kernel void " + this->get_kernel_name() + "(global uint *output)\n"
-                "{\n"
-                "    uint gid = get_global_id(0);\n"
-                "    output[gid] = " + result + ";\n"
-                "}\n";
-
-        #else
-            return         
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_spec_constant>\n"
-                "using namespace cl;\n"
-                "typedef " + type_name<T>() + " TYPE;\n"
-                "spec_constant<TYPE,  1> spec1{TYPE(0)};\n"
-                "spec_constant<TYPE,  2> spec2{TYPE(0)};\n"
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<uint[]> output)\n"
-                "{\n"
-                "    uint gid = get_global_id(0);\n"
-                "    TYPE var1(spec1.get());\n"
-                "    TYPE var2(spec2);\n"
-                "    TYPE var3; var3 = spec2;\n"
-                "    if((var1 == TYPE(" + std::to_string(m_test_value) +")) "
-                       "&& (var2 == TYPE(" + std::to_string(m_test_value) +"))\n"
-                       "&& (var3 == TYPE(" + std::to_string(m_test_value) +")))\n"
-                "    {\n"
-                "        output[gid] = " + std::to_string(m_test_value) +";\n"
-                "    }\n"
-                "    else\n"
-                "    {\n"
-                "        output[gid] = gid;\n"
-                "    }\n"
-                "}\n";        
-        #endif
-    }
-
-private:
-    bool m_set_spec_constant;
-    cl_uint m_test_value;
-};
-
-AUTO_TEST_CASE(test_spec_constants_initializing_variables)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{    
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    const std::vector<bool> set_spec_const_options { true, false };
-    for(auto option : set_spec_const_options)
-    {        
-        RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_init_var<cl_char>(option));
-        RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_init_var<cl_uchar>(option));
-        RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_init_var<cl_int>(option));
-        RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_init_var<cl_uint>(option));
-        RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_init_var<cl_long>(option));
-        RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_init_var<cl_ulong>(option));
-        RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_init_var<cl_float>(option));
-        if(is_extension_available(device, "cl_khr_fp16"))
-        {
-            RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_init_var<cl_half>(option));
-        }
-        if(is_extension_available(device, "cl_khr_fp64"))
-        {
-            RUN_SPEC_CONSTANTS_TEST_MACRO(spec_const_init_var<cl_double>(option));
-        }
-    }
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }    
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_SPEC_CONSTANTS_TEST_SPEC_CONSTS_INIT_VARS_HPP
diff --git a/test_conformance/clcpp/spirv10_2016.04.27.7z b/test_conformance/clcpp/spirv10_2016.04.27.7z
deleted file mode 100644
index 306be24308..0000000000
--- a/test_conformance/clcpp/spirv10_2016.04.27.7z
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fe4f34d616ed7ef70e870c22078f60655f68b0c5191c8d8b9d045dd0e7390bc2
-size 5529152
diff --git a/test_conformance/clcpp/subgroups/CMakeLists.txt b/test_conformance/clcpp/subgroups/CMakeLists.txt
deleted file mode 100644
index c8307d26d1..0000000000
--- a/test_conformance/clcpp/subgroups/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_SUBGROUPS)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/subgroups/common.hpp b/test_conformance/clcpp/subgroups/common.hpp
deleted file mode 100644
index 2b05a3cbc9..0000000000
--- a/test_conformance/clcpp/subgroups/common.hpp
+++ /dev/null
@@ -1,97 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SG_COMMON_HPP
-#define TEST_CONFORMANCE_CLCPP_SG_COMMON_HPP
-
-#include <string>
-#include <vector>
-#include <limits>
-
-enum class work_group_op : int {
-    add, min, max
-};
-
-std::string to_string(work_group_op op)
-{
-    switch (op)
-    {
-        case work_group_op::add:
-            return "add";
-        case work_group_op::min:
-            return "min";
-        case work_group_op::max:
-            return "max";
-        default:
-            break;
-    }
-    return "";
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-std::vector<CL_INT_TYPE> generate_input(size_t count, size_t wg_size)
-{
-    std::vector<CL_INT_TYPE> input(count, CL_INT_TYPE(1));
-    switch (op)
-    {
-        case work_group_op::add:
-            return input;
-        case work_group_op::min:
-            {
-                size_t j = wg_size;
-                for(size_t i = 0; i < count; i++)
-                {
-                    input[i] = static_cast<CL_INT_TYPE>(j);
-                    j--;
-                    if(j == 0)
-                    {
-                        j = wg_size;
-                    }
-                }
-            }
-            break;
-        case work_group_op::max:
-            {
-                size_t j = 0;
-                for(size_t i = 0; i < count; i++)
-                {
-                    input[i] = static_cast<CL_INT_TYPE>(j);
-                    j++;
-                    if(j == wg_size)
-                    {
-                        j = 0;
-                    }
-                }
-            }
-    }
-    return input;
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-std::vector<CL_INT_TYPE> generate_output(size_t count, size_t wg_size)
-{
-    switch (op)
-    {
-        case work_group_op::add:
-            return std::vector<CL_INT_TYPE>(count, CL_INT_TYPE(0));
-        case work_group_op::min:
-            return std::vector<CL_INT_TYPE>(count, (std::numeric_limits<CL_INT_TYPE>::max)());
-        case work_group_op::max:
-            return std::vector<CL_INT_TYPE>(count, (std::numeric_limits<CL_INT_TYPE>::min)());
-    }
-    return std::vector<CL_INT_TYPE>(count, CL_INT_TYPE(0));
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_SG_COMMON_HPP
diff --git a/test_conformance/clcpp/subgroups/main.cpp b/test_conformance/clcpp/subgroups/main.cpp
deleted file mode 100644
index c81f2315ff..0000000000
--- a/test_conformance/clcpp/subgroups/main.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "test_sg_all.hpp"
-#include "test_sg_any.hpp"
-#include "test_sg_broadcast.hpp"
-#include "test_sg_reduce.hpp"
-#include "test_sg_scan_inclusive.hpp"
-#include "test_sg_scan_exclusive.hpp"
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/subgroups/test_sg_all.hpp b/test_conformance/clcpp/subgroups/test_sg_all.hpp
deleted file mode 100644
index 5dc060cef2..0000000000
--- a/test_conformance/clcpp/subgroups/test_sg_all.hpp
+++ /dev/null
@@ -1,221 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_ALL_HPP
-#define TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_ALL_HPP
-
-#include <vector>
-#include <limits>
-#include <algorithm>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-// Common for tests of sub-group functions
-#include "common.hpp"
-
-std::string generate_sg_all_kernel_code()
-{
-    return "#include <opencl_memory>\n"
-           "#include <opencl_work_item>\n"
-           "#include <opencl_work_group>\n"
-           "using namespace cl;\n"
-           "__kernel void test_sg_all(global_ptr<uint[]> input, global_ptr<uint[]> output)\n"
-           "{\n"
-           "    ulong tid = get_global_id(0);\n"
-           "    bool result = sub_group_all(input[tid] < input[tid+1]);\n"
-           "    if(!result) {\n        output[tid] = 0;\n        return;\n    }\n"
-           "    output[tid] = 1;\n"
-           "}\n";
-}
-
-int verify_sg_all(const std::vector<cl_uint> &in, const std::vector<cl_uint> &out, size_t count, size_t wg_size, size_t sg_size)
-{
-    size_t i, j, k;
-    for (i = 0; i < count; i += wg_size)
-    {
-        for (j = 0; j < ((count - i) > wg_size ? wg_size : (count - i)); j+= sg_size)
-        {
-            // sub-group all
-            bool all = true;
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                if(!(in[i+j+k] < in[i+j+k+1]))
-                {
-                    all = false;
-                    break;
-                }
-            }
-
-            // Convert bool to uint
-            cl_uint all_uint = all ? 1 : 0;
-            // Check if all work-items in sub-group stored correct value
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                if (all_uint != out[i + j + k])
-                {
-                    log_info(
-                        "sub_group_all %s: Error at %lu: expected = %lu, got = %lu\n",
-                        type_name<cl_uint>().c_str(),
-                        i + j,
-                        static_cast<size_t>(all_uint),
-                        static_cast<size_t>(out[i + j + k]));
-                    return -1;
-                }
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-std::vector<cl_uint> generate_input_sg_all(size_t count, size_t wg_size)
-{
-    std::vector<cl_uint> input(count, cl_uint(0));
-    size_t j = wg_size;
-    for(size_t i = 0; i < count; i++)
-    {
-        input[i] = static_cast<cl_uint>(i);
-        // In one place in ~half of work-groups (input[tid] < input[tid+1]) will
-        // generate false, it means that for sub_group_all(input[tid] < input[tid+1])
-        // should return false for all sub-groups in that work-groups
-        if((j == wg_size/2) && (i > count/2))
-        {
-            input[i] = input[i - 1];
-        }
-        j--;
-        if(j == 0)
-        {
-            j = wg_size;
-        }
-    }
-    return input;
-}
-
-std::vector<cl_uint> generate_output_sg_all(size_t count, size_t wg_size)
-{
-    (void) wg_size;
-    return std::vector<cl_uint>(count, cl_uint(1));
-}
-
-int sub_group_all(cl_device_id device, cl_context context, cl_command_queue queue, size_t count)
-{
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t wg_size;
-    size_t sg_max_size;
-    size_t work_size[1];
-    int err;
-
-    std::string code_str = generate_sg_all_kernel_code();
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_sg_all");
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    log_info("SKIPPED: OpenCL C kernels not provided for this test. Skipping the test.\n");
-    return CL_SUCCESS;
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_sg_all");
-    RETURN_ON_ERROR(err)
-#endif
-
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
-    RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")
-
-    size_t param_value_size = 0;
-    err = clGetKernelSubGroupInfo(
-        kernel, device, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
-        sizeof(size_t), static_cast<void*>(&wg_size),
-        sizeof(size_t), static_cast<void*>(&sg_max_size),
-        &param_value_size
-    );
-    RETURN_ON_CL_ERROR(err, "clGetKernelSubGroupInfo")
-
-    // Verify size of returned param
-    if(param_value_size != sizeof(size_t))
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "Returned size of max sub group size not valid! (Expected %lu, got %lu)\n",
-            sizeof(size_t),
-            param_value_size
-        )
-    }
-
-    // Calculate global work size
-    size_t flat_work_size;
-    size_t wg_number = static_cast<size_t>(
-        std::ceil(static_cast<double>(count) / wg_size)
-    );
-    flat_work_size = wg_number * wg_size;
-    work_size[0] = flat_work_size;
-
-    std::vector<cl_uint> input = generate_input_sg_all(flat_work_size + 1, wg_size);
-    std::vector<cl_uint> output = generate_output_sg_all(flat_work_size, wg_size);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(cl_uint) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, &wg_size, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (verify_sg_all(input, output, flat_work_size, wg_size, sg_max_size) != CL_SUCCESS)
-    {
-        RETURN_ON_ERROR_MSG(-1, "sub_group_all failed");
-    }
-    log_info("sub_group_all passed\n");
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-AUTO_TEST_CASE(test_sub_group_all)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int err = CL_SUCCESS;
-    err = sub_group_all(device, context, queue, n_elems);
-    CHECK_ERROR(err)
-    return err;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_ALL_HPP
diff --git a/test_conformance/clcpp/subgroups/test_sg_any.hpp b/test_conformance/clcpp/subgroups/test_sg_any.hpp
deleted file mode 100644
index 4c6adce91c..0000000000
--- a/test_conformance/clcpp/subgroups/test_sg_any.hpp
+++ /dev/null
@@ -1,221 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_ANY_HPP
-#define TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_ANY_HPP
-
-#include <vector>
-#include <limits>
-#include <algorithm>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-// Common for tests of sub-group functions
-#include "common.hpp"
-
-std::string generate_sg_any_kernel_code()
-{
-    return "#include <opencl_memory>\n"
-           "#include <opencl_work_item>\n"
-           "#include <opencl_work_group>\n"
-           "using namespace cl;\n"
-           "__kernel void test_sg_any(global_ptr<uint[]> input, global_ptr<uint[]> output)\n"
-           "{\n"
-           "    ulong tid = get_global_id(0);\n"
-           "    bool result = sub_group_any(input[tid] == input[tid+1]);\n"
-           "    if(!result) {\n        output[tid] = 0;\n        return;\n    }\n"
-           "    output[tid] = 1;\n"
-           "}\n";
-}
-
-int verify_sg_any(const std::vector<cl_uint> &in, const std::vector<cl_uint> &out, size_t count, size_t wg_size, size_t sg_size)
-{
-    size_t i, j, k;
-    for (i = 0; i < count; i += wg_size)
-    {
-        for (j = 0; j < ((count - i) > wg_size ? wg_size : (count - i)); j+= sg_size)
-        {
-            // sub-group any
-            bool any = false;
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                if(in[i+j+k] == in[i+j+k+1])
-                {
-                    any = true;
-                    break;
-                }
-            }
-
-            // Convert bool to uint
-            cl_uint any_uint = any ? 1 : 0;
-            // Check if all work-items in sub-group stored correct value
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                if (any_uint != out[i + j + k])
-                {
-                    log_info(
-                        "sub_group_any %s: Error at %lu: expected = %lu, got = %lu\n",
-                        type_name<cl_uint>().c_str(),
-                        i + j,
-                        static_cast<size_t>(any_uint),
-                        static_cast<size_t>(out[i + j + k]));
-                    return -1;
-                }
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-std::vector<cl_uint> generate_input_sg_any(size_t count, size_t wg_size)
-{
-    std::vector<cl_uint> input(count, cl_uint(0));
-    size_t j = wg_size;
-    for(size_t i = 0; i < count; i++)
-    {
-        input[i] = static_cast<cl_uint>(i);
-        // In one place in ~half of work-groups (input[tid] == input[tid+1]) will
-        // generate true, it means that for sub_group_all(input[tid] == input[tid+1])
-        // should return false for one sub-group in that work-groups
-        if((j == wg_size/2) && (i > count/2))
-        {
-            input[i] = input[i - 1];
-        }
-        j--;
-        if(j == 0)
-        {
-            j = wg_size;
-        }
-    }
-    return input;
-}
-
-std::vector<cl_uint> generate_output_sg_any(size_t count, size_t wg_size)
-{
-    (void) wg_size;
-    return std::vector<cl_uint>(count, cl_uint(1));
-}
-
-int sub_group_any(cl_device_id device, cl_context context, cl_command_queue queue, size_t count)
-{
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t wg_size;
-    size_t sg_max_size;
-    size_t work_size[1];
-    int err;
-
-    std::string code_str = generate_sg_any_kernel_code();
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_sg_any");
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    log_info("SKIPPED: OpenCL C kernels not provided for this test. Skipping the test.\n");
-    return CL_SUCCESS;
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_sg_any");
-    RETURN_ON_ERROR(err)
-#endif
-
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
-    RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")
-
-    size_t param_value_size = 0;
-    err = clGetKernelSubGroupInfo(
-        kernel, device, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
-        sizeof(size_t), static_cast<void*>(&wg_size),
-        sizeof(size_t), static_cast<void*>(&sg_max_size),
-        &param_value_size
-    );
-    RETURN_ON_CL_ERROR(err, "clGetKernelSubGroupInfo")
-
-    // Verify size of returned param
-    if(param_value_size != sizeof(size_t))
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "Returned size of max sub group size not valid! (Expected %lu, got %lu)\n",
-            sizeof(size_t),
-            param_value_size
-        )
-    }
-
-    // Calculate global work size
-    size_t flat_work_size;
-    size_t wg_number = static_cast<size_t>(
-        std::ceil(static_cast<double>(count) / wg_size)
-    );
-    flat_work_size = wg_number * wg_size;
-    work_size[0] = flat_work_size;
-
-    std::vector<cl_uint> input = generate_input_sg_any(flat_work_size + 1, wg_size);
-    std::vector<cl_uint> output = generate_output_sg_any(flat_work_size, wg_size);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(cl_uint) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, &wg_size, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (verify_sg_any(input, output, flat_work_size, wg_size, sg_max_size) != CL_SUCCESS)
-    {
-        RETURN_ON_ERROR_MSG(-1, "sub_group_any failed");
-    }
-    log_info("sub_group_any passed\n");
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-AUTO_TEST_CASE(test_sub_group_any)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int err = CL_SUCCESS;
-    err = sub_group_any(device, context, queue, n_elems);
-    CHECK_ERROR(err)
-    return err;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_ANY_HPP
diff --git a/test_conformance/clcpp/subgroups/test_sg_broadcast.hpp b/test_conformance/clcpp/subgroups/test_sg_broadcast.hpp
deleted file mode 100644
index 22317be592..0000000000
--- a/test_conformance/clcpp/subgroups/test_sg_broadcast.hpp
+++ /dev/null
@@ -1,206 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_BROADCAST_HPP
-#define TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_BROADCAST_HPP
-
-#include <vector>
-#include <limits>
-#include <algorithm>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-// Common for tests of sub-group functions
-#include "common.hpp"
-
-std::string generate_sg_broadcast_kernel_code()
-{
-    return
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "#include <opencl_work_group>\n"
-        "using namespace cl;\n"
-        "__kernel void test_sg_broadcast(global_ptr<uint[]> input, global_ptr<uint[]> output)\n"
-        "{\n"
-        "    ulong tid = get_global_id(0);\n"
-        "    uint result = sub_group_broadcast(input[tid], 0);\n"
-        "    output[tid] = result;\n"
-        "}\n";
-}
-
-int
-verify_sg_broadcast(const std::vector<cl_uint> &in, const std::vector<cl_uint> &out, size_t count, size_t wg_size, size_t sg_size)
-{
-    size_t i, j, k;
-    for (i = 0; i < count; i += wg_size)
-    {
-        for (j = 0; j < ((count - i) > wg_size ? wg_size : (count - i)); j+= sg_size)
-        {
-            // sub-group broadcast
-            cl_uint broadcast_result = in[i+j];
-
-            // Check if all work-items in sub-group stored correct value
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                if (broadcast_result != out[i + j + k])
-                {
-                    log_info(
-                        "sub_group_any %s: Error at %lu: expected = %lu, got = %lu\n",
-                        type_name<cl_uint>().c_str(),
-                        i + j,
-                        static_cast<size_t>(broadcast_result),
-                        static_cast<size_t>(out[i + j + k]));
-                    return -1;
-                }
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-std::vector<cl_uint> generate_input_sg_broadcast(size_t count, size_t wg_size)
-{
-    std::vector<cl_uint> input(count, cl_uint(0));
-    size_t j = wg_size;
-    for(size_t i = 0; i < count; i++)
-    {
-        input[i] = static_cast<cl_uint>(j);
-        j--;
-        if(j == 0)
-        {
-            j = wg_size;
-        }
-    }
-    return input;
-}
-
-std::vector<cl_uint> generate_output_sg_broadcast(size_t count, size_t wg_size)
-{
-    (void) wg_size;
-    return std::vector<cl_uint>(count, cl_uint(1));
-}
-
-int sub_group_broadcast(cl_device_id device, cl_context context, cl_command_queue queue, size_t count)
-{
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t wg_size;
-    size_t sg_max_size;
-    size_t work_size[] = { 1 };
-    int err;
-
-    // Get kernel source code
-    std::string code_str = generate_sg_broadcast_kernel_code();
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_sg_broadcast");
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    log_info("SKIPPED: OpenCL C kernels not provided for this test. Skipping the test.\n");
-    return CL_SUCCESS;
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_sg_broadcast");
-    RETURN_ON_ERROR(err)
-#endif
-
-    // Get max flat workgroup size
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
-    RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")
-
-    size_t param_value_size = 0;
-    err = clGetKernelSubGroupInfo(
-        kernel, device, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
-        sizeof(size_t), static_cast<void*>(&wg_size),
-        sizeof(size_t), static_cast<void*>(&sg_max_size),
-        &param_value_size
-    );
-    RETURN_ON_CL_ERROR(err, "clGetKernelSubGroupInfo")
-
-    // Verify size of returned param
-    if(param_value_size != sizeof(size_t))
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "Returned size of max sub group size not valid! (Expected %lu, got %lu)\n",
-            sizeof(size_t),
-            param_value_size
-        )
-    }
-
-    // Calculate global work size
-    size_t flat_work_size = count;
-    size_t wg_number = static_cast<size_t>(
-        std::ceil(static_cast<double>(count) / wg_size)
-    );
-    flat_work_size = wg_number * wg_size;
-    work_size[0] = flat_work_size;
-
-    std::vector<cl_uint> input = generate_input_sg_broadcast(flat_work_size, wg_size);
-    std::vector<cl_uint> output = generate_output_sg_broadcast(flat_work_size, wg_size);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(cl_uint) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, &wg_size, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    int result = verify_sg_broadcast( input, output, work_size[0], wg_size, sg_max_size);
-    RETURN_ON_ERROR_MSG(result, "sub_group_broadcast failed")
-    log_info("sub_group_broadcast passed\n");
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-AUTO_TEST_CASE(test_sub_group_broadcast)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int err = CL_SUCCESS;
-    err = sub_group_broadcast(device, context, queue, n_elems);
-    CHECK_ERROR(err)
-    return err;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_BROADCAST_HPP
diff --git a/test_conformance/clcpp/subgroups/test_sg_reduce.hpp b/test_conformance/clcpp/subgroups/test_sg_reduce.hpp
deleted file mode 100644
index 91acd474f9..0000000000
--- a/test_conformance/clcpp/subgroups/test_sg_reduce.hpp
+++ /dev/null
@@ -1,348 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_REDUCE_HPP
-#define TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_REDUCE_HPP
-
-#include <vector>
-#include <limits>
-#include <algorithm>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-// Common for tests of sub-group functions
-#include "common.hpp"
-
-template <class CL_INT_TYPE, work_group_op op>
-std::string generate_sg_reduce_kernel_code()
-{
-    return "#include <opencl_memory>\n"
-           "#include <opencl_work_item>\n"
-           "#include <opencl_work_group>\n"
-           "using namespace cl;\n"
-           "__kernel void test_sg_reduce(global_ptr<" + type_name<CL_INT_TYPE>() + "[]> input, "
-                                        "global_ptr<" + type_name<CL_INT_TYPE>() + "[]> output)\n"
-           "{\n"
-           "    ulong tid = get_global_id(0);\n"
-           "    " + type_name<CL_INT_TYPE>() + " result = sub_group_reduce<work_group_op::" + to_string(op) + ">(input[tid]);\n"
-           "    output[tid] = result;\n"
-           "}\n";
-}
-
-template <class CL_INT_TYPE>
-int verify_sg_reduce_add(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size, size_t sg_size)
-{
-    size_t i, j, k;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j+= sg_size)
-        {
-            CL_INT_TYPE sum = 0;
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                sum += in[i + j + k];
-            }
-
-            // Check if all work-items in sub-group stored correct value
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                if (sum != out[i + j + k])
-                {
-                    log_info(
-                        "sub_group_reduce_add %s: Error at %lu: expected = %lu, got = %lu\n",
-                        type_name<cl_uint>().c_str(),
-                        i + j,
-                        static_cast<size_t>(sum),
-                        static_cast<size_t>(out[i + j + k]));
-                    return -1;
-                }
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE>
-int verify_sg_reduce_min(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size, size_t sg_size)
-{
-    size_t i, j, k;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j+= sg_size)
-        {
-            CL_INT_TYPE min = (std::numeric_limits<CL_INT_TYPE>::max)();
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                min = std::min<CL_INT_TYPE>(min, in[i + j + k]);
-            }
-
-            // Check if all work-items in sub-group stored correct value
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                if (min != out[i + j + k])
-                {
-                    log_info(
-                        "sub_group_reduce_min %s: Error at %lu: expected = %lu, got = %lu\n",
-                        type_name<cl_uint>().c_str(),
-                        i + j,
-                        static_cast<size_t>(min),
-                        static_cast<size_t>(out[i + j + k]));
-                    return -1;
-                }
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE>
-int verify_sg_reduce_max(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size, size_t sg_size)
-{
-    size_t i, j, k;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j+= sg_size)
-        {
-            CL_INT_TYPE max = (std::numeric_limits<CL_INT_TYPE>::min)();
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                max = std::max<CL_INT_TYPE>(max, in[i + j + k]);
-            }
-
-            // Check if all work-items in sub-group stored correct value
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                if (max != out[i + j + k])
-                {
-                    log_info(
-                        "sub_group_reduce_max %s: Error at %lu: expected = %lu, got = %lu\n",
-                        type_name<cl_uint>().c_str(),
-                        i + j,
-                        static_cast<size_t>(max),
-                        static_cast<size_t>(out[i + j + k]));
-                    return -1;
-                }
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-int verify_sg_reduce(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size, size_t sg_size)
-{
-    switch (op)
-    {
-        case work_group_op::add:
-            return verify_sg_reduce_add(in, out, wg_size, sg_size);
-        case work_group_op::min:
-            return verify_sg_reduce_min(in, out, wg_size, sg_size);
-        case work_group_op::max:
-            return verify_sg_reduce_max(in, out, wg_size, sg_size);
-    }
-    return -1;
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-int sub_group_reduce(cl_device_id device, cl_context context, cl_command_queue queue, size_t count)
-{
-    // don't run test for unsupported types
-    if(!type_supported<CL_INT_TYPE>(device))
-    {
-        return CL_SUCCESS;
-    }
-
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t wg_size;
-    size_t sg_max_size;
-    size_t work_size[1];
-    int err;
-
-    std::string code_str = generate_sg_reduce_kernel_code<CL_INT_TYPE, op>();
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_sg_reduce");
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    log_info("SKIPPED: OpenCL C kernels not provided for this test. Skipping the test.\n");
-    return CL_SUCCESS;
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_sg_reduce");
-    RETURN_ON_ERROR(err)
-#endif
-
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
-    RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")
-
-    size_t param_value_size = 0;
-    err = clGetKernelSubGroupInfo(
-        kernel, device, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
-        sizeof(size_t), static_cast<void*>(&wg_size),
-        sizeof(size_t), static_cast<void*>(&sg_max_size),
-        &param_value_size
-    );
-    RETURN_ON_CL_ERROR(err, "clGetKernelSubGroupInfo")
-
-    // Verify size of returned param
-    if(param_value_size != sizeof(size_t))
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "Returned size of max sub group size not valid! (Expected %lu, got %lu)\n",
-            sizeof(size_t),
-            param_value_size
-        )
-    }
-
-    // Calculate global work size
-    size_t flat_work_size;
-    size_t wg_number = static_cast<size_t>(
-        std::ceil(static_cast<double>(count) / wg_size)
-    );
-    flat_work_size = wg_number * wg_size;
-    work_size[0] = flat_work_size;
-
-    std::vector<CL_INT_TYPE> input = generate_input<CL_INT_TYPE, op>(flat_work_size, wg_size);
-    std::vector<CL_INT_TYPE> output = generate_output<CL_INT_TYPE, op>(flat_work_size, wg_size);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(CL_INT_TYPE) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    buffers[1] =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(CL_INT_TYPE) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(CL_INT_TYPE) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, &wg_size, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(CL_INT_TYPE) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (verify_sg_reduce<CL_INT_TYPE, op>(input, output, wg_size, sg_max_size) != CL_SUCCESS)
-    {
-        RETURN_ON_ERROR_MSG(-1, "sub_group_reduce_%s %s failed", to_string(op).c_str(), type_name<CL_INT_TYPE>().c_str());
-    }
-    log_info("sub_group_reduce_%s %s passed\n", to_string(op).c_str(), type_name<CL_INT_TYPE>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-AUTO_TEST_CASE(test_sub_group_reduce_add)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = sub_group_reduce<cl_int, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_reduce<cl_uint, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_reduce<cl_long, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_reduce<cl_ulong, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-AUTO_TEST_CASE(test_sub_group_reduce_min)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = sub_group_reduce<cl_int, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_reduce<cl_uint, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_reduce<cl_long, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_reduce<cl_ulong, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-AUTO_TEST_CASE(test_sub_group_reduce_max)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = sub_group_reduce<cl_int, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_reduce<cl_uint, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_reduce<cl_long, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_reduce<cl_ulong, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_REDUCE_HPP
diff --git a/test_conformance/clcpp/subgroups/test_sg_scan_exclusive.hpp b/test_conformance/clcpp/subgroups/test_sg_scan_exclusive.hpp
deleted file mode 100644
index 72081750ec..0000000000
--- a/test_conformance/clcpp/subgroups/test_sg_scan_exclusive.hpp
+++ /dev/null
@@ -1,328 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_SCAN_EXCLUSIVE_HPP
-#define TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_SCAN_EXCLUSIVE_HPP
-
-#include <vector>
-#include <algorithm>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-// Common for tests of sub-group functions
-#include "common.hpp"
-
-template <class CL_INT_TYPE, work_group_op op>
-std::string generate_sg_scan_exclusive_kernel_code()
-{
-    return "#include <opencl_memory>\n"
-           "#include <opencl_work_item>\n"
-           "#include <opencl_work_group>\n"
-           "using namespace cl;\n"
-           "__kernel void test_sg_scan_exclusive(global_ptr<" + type_name<CL_INT_TYPE>() + "[]> input, "
-                                                "global_ptr<" + type_name<CL_INT_TYPE>() + "[]> output)\n"
-           "{\n"
-           "    ulong tid = get_global_id(0);\n"
-           "    " + type_name<CL_INT_TYPE>() + " result = sub_group_scan_exclusive<work_group_op::" + to_string(op) + ">(input[tid]);\n"
-           "    output[tid] = result;\n"
-           "}\n";
-}
-
-template <class CL_INT_TYPE>
-int verify_sg_scan_exclusive_add(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size, size_t sg_size)
-{
-    size_t i, j, k;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j+= sg_size)
-        {
-            CL_INT_TYPE sum = 0;
-            // Check if all work-items in sub-group stored correct value
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                if (sum != out[i + j + k])
-                {
-                    log_info(
-                        "sub_group_scan_exclusive_add %s: Error at %lu: expected = %lu, got = %lu\n",
-                        type_name<cl_uint>().c_str(),
-                        i + j,
-                        static_cast<size_t>(sum),
-                        static_cast<size_t>(out[i + j + k]));
-                    return -1;
-                }
-                sum += in[i + j + k];
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE>
-int verify_sg_scan_exclusive_min(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size, size_t sg_size)
-{
-    size_t i, j, k;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j+= sg_size)
-        {
-            CL_INT_TYPE min = (std::numeric_limits<CL_INT_TYPE>::max)();
-            // Check if all work-items in sub-group stored correct value
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                if (min != out[i + j + k])
-                {
-                    log_info(
-                        "sub_group_scan_exclusive_min %s: Error at %lu: expected = %lu, got = %lu\n",
-                        type_name<cl_uint>().c_str(),
-                        i + j,
-                        static_cast<size_t>(min),
-                        static_cast<size_t>(out[i + j + k]));
-                    return -1;
-                }
-                min = std::min<CL_INT_TYPE>(min, in[i + j + k]);
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE>
-int verify_sg_scan_exclusive_max(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size, size_t sg_size)
-{
-    size_t i, j, k;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j+= sg_size)
-        {
-            CL_INT_TYPE max = (std::numeric_limits<CL_INT_TYPE>::min)();
-            // Check if all work-items in sub-group stored correct value
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                if (max != out[i + j + k])
-                {
-                    log_info(
-                        "sub_group_scan_exclusive_max %s: Error at %lu: expected = %lu, got = %lu\n",
-                        type_name<cl_uint>().c_str(),
-                        i + j,
-                        static_cast<size_t>(max),
-                        static_cast<size_t>(out[i + j + k]));
-                    return -1;
-                }
-                max = std::max<CL_INT_TYPE>(max, in[i + j + k]);
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-int verify_sg_scan_exclusive(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size, size_t sg_size)
-{
-    switch (op)
-    {
-        case work_group_op::add:
-            return verify_sg_scan_exclusive_add(in, out, wg_size, sg_size);
-        case work_group_op::min:
-            return verify_sg_scan_exclusive_min(in, out, wg_size, sg_size);
-        case work_group_op::max:
-            return verify_sg_scan_exclusive_max(in, out, wg_size, sg_size);
-    }
-    return -1;
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-int sub_group_scan_exclusive(cl_device_id device, cl_context context, cl_command_queue queue, size_t count)
-{
-    // don't run test for unsupported types
-    if(!type_supported<CL_INT_TYPE>(device))
-    {
-        return CL_SUCCESS;
-    }
-
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t wg_size;
-    size_t sg_max_size;
-    size_t work_size[1];
-    int err;
-
-    std::string code_str = generate_sg_scan_exclusive_kernel_code<CL_INT_TYPE, op>();
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_sg_scan_exclusive");
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    log_info("SKIPPED: OpenCL C kernels not provided for this test. Skipping the test.\n");
-    return CL_SUCCESS;
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_sg_scan_exclusive");
-    RETURN_ON_ERROR(err)
-#endif
-
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
-    RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")
-
-    size_t param_value_size = 0;
-    err = clGetKernelSubGroupInfo(
-        kernel, device, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
-        sizeof(size_t), static_cast<void*>(&wg_size),
-        sizeof(size_t), static_cast<void*>(&sg_max_size),
-        &param_value_size
-    );
-    RETURN_ON_CL_ERROR(err, "clGetKernelSubGroupInfo")
-
-    // Verify size of returned param
-    if(param_value_size != sizeof(size_t))
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "Returned size of max sub group size not valid! (Expected %lu, got %lu)\n",
-            sizeof(size_t),
-            param_value_size
-        )
-    }
-
-    // Calculate global work size
-    size_t flat_work_size;
-    size_t wg_number = static_cast<size_t>(
-        std::ceil(static_cast<double>(count) / wg_size)
-    );
-    flat_work_size = wg_number * wg_size;
-    work_size[0] = flat_work_size;
-
-    std::vector<CL_INT_TYPE> input = generate_input<CL_INT_TYPE, op>(flat_work_size, wg_size);
-    std::vector<CL_INT_TYPE> output = generate_output<CL_INT_TYPE, op>(flat_work_size, wg_size);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(CL_INT_TYPE) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    buffers[1] =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(CL_INT_TYPE) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(CL_INT_TYPE) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, &wg_size, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(CL_INT_TYPE) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (verify_sg_scan_exclusive<CL_INT_TYPE, op>(input, output, wg_size, sg_max_size) != CL_SUCCESS)
-    {
-        RETURN_ON_ERROR_MSG(-1, "sub_group_scan_exclusive_%s %s failed", to_string(op).c_str(), type_name<CL_INT_TYPE>().c_str());
-    }
-    log_info("sub_group_scan_exclusive_%s %s passed\n", to_string(op).c_str(), type_name<CL_INT_TYPE>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-AUTO_TEST_CASE(test_sub_group_scan_exclusive_add)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = sub_group_scan_exclusive<cl_int, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_exclusive<cl_uint, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_exclusive<cl_long, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_exclusive<cl_ulong, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-AUTO_TEST_CASE(test_sub_group_scan_exclusive_min)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = sub_group_scan_exclusive<cl_int, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    local_error = sub_group_scan_exclusive<cl_uint, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    local_error = sub_group_scan_exclusive<cl_long, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    local_error = sub_group_scan_exclusive<cl_ulong, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-AUTO_TEST_CASE(test_sub_group_scan_exclusive_max)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = sub_group_scan_exclusive<cl_int, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_exclusive<cl_uint, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_exclusive<cl_long, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_exclusive<cl_ulong, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_SCAN_EXCLUSIVE_HPP
diff --git a/test_conformance/clcpp/subgroups/test_sg_scan_inclusive.hpp b/test_conformance/clcpp/subgroups/test_sg_scan_inclusive.hpp
deleted file mode 100644
index 0218668cbe..0000000000
--- a/test_conformance/clcpp/subgroups/test_sg_scan_inclusive.hpp
+++ /dev/null
@@ -1,335 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_SCAN_INCLUSIVE_HPP
-#define TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_SCAN_INCLUSIVE_HPP
-
-#include <vector>
-#include <algorithm>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-// Common for tests of sub-group functions
-#include "common.hpp"
-
-template <class CL_INT_TYPE, work_group_op op>
-std::string generate_sg_scan_inclusive_kernel_code()
-{
-    return "#include <opencl_memory>\n"
-           "#include <opencl_work_item>\n"
-           "#include <opencl_work_group>\n"
-           "using namespace cl;\n"
-           "__kernel void test_sg_scan_inclusive(global_ptr<" + type_name<CL_INT_TYPE>() + "[]> input, "
-                                                "global_ptr<" + type_name<CL_INT_TYPE>() + "[]> output)\n"
-           "{\n"
-           "    ulong tid = get_global_id(0);\n"
-           "    " + type_name<CL_INT_TYPE>() + " result = sub_group_scan_inclusive<work_group_op::" + to_string(op) + ">(input[tid]);\n"
-           "    output[tid] = result;\n"
-           "}\n";
-}
-
-template <class CL_INT_TYPE>
-int verify_sg_scan_inclusive_add(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size, size_t sg_size)
-{
-    size_t i, j, k;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j+= sg_size)
-        {
-            CL_INT_TYPE sum = 0;
-            // Check if all work-items in sub-group stored correct value
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                sum += in[i + j + k];
-                if (sum != out[i + j + k])
-                {
-                    log_info(
-                        "sub_group_scan_exclusive_add %s: Error at %lu: expected = %lu, got = %lu\n",
-                        type_name<cl_uint>().c_str(),
-                        i + j,
-                        static_cast<size_t>(sum),
-                        static_cast<size_t>(out[i + j + k]));
-                    return -1;
-                }
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE>
-int verify_sg_scan_inclusive_min(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size, size_t sg_size)
-{
-    size_t i, j, k;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j+= sg_size)
-        {
-            CL_INT_TYPE min = (std::numeric_limits<CL_INT_TYPE>::max)();
-            // Check if all work-items in sub-group stored correct value
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                min = std::min<CL_INT_TYPE>(min, in[i + j + k]);
-                if (min != out[i + j + k])
-                {
-                    log_info(
-                        "sub_group_scan_exclusive_min %s: Error at %lu: expected = %lu, got = %lu\n",
-                        type_name<cl_uint>().c_str(),
-                        i + j,
-                        static_cast<size_t>(min),
-                        static_cast<size_t>(out[i + j + k]));
-                    return -1;
-                }
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE>
-int verify_sg_scan_inclusive_max(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size, size_t sg_size)
-{
-    size_t i, j, k;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j+= sg_size)
-        {
-            CL_INT_TYPE max = (std::numeric_limits<CL_INT_TYPE>::min)();
-            // Check if all work-items in sub-group stored correct value
-            for (k = 0; k < ((wg_size - j) > sg_size ? sg_size : (wg_size - j)); k++)
-            {
-                max = std::max<CL_INT_TYPE>(max, in[i + j + k]);
-                if (max != out[i + j + k])
-                {
-                    log_info(
-                        "sub_group_scan_exclusive_max %s: Error at %lu: expected = %lu, got = %lu\n",
-                        type_name<cl_uint>().c_str(),
-                        i + j,
-                        static_cast<size_t>(max),
-                        static_cast<size_t>(out[i + j + k]));
-                    return -1;
-                }
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-int verify_sg_scan_inclusive(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size, size_t sg_size)
-{
-    switch (op)
-    {
-        case work_group_op::add:
-            return verify_sg_scan_inclusive_add(in, out, wg_size, sg_size);
-        case work_group_op::min:
-            return verify_sg_scan_inclusive_min(in, out, wg_size, sg_size);
-        case work_group_op::max:
-            return verify_sg_scan_inclusive_max(in, out, wg_size, sg_size);
-    }
-    return -1;
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-int sub_group_scan_inclusive(cl_device_id device, cl_context context, cl_command_queue queue, size_t count)
-{
-    // don't run test for unsupported types
-    if(!type_supported<CL_INT_TYPE>(device))
-    {
-        return CL_SUCCESS;
-    }
-
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t wg_size;
-    size_t sg_max_size;
-    size_t work_size[1];
-    int err;
-
-    std::string code_str = generate_sg_scan_inclusive_kernel_code<CL_INT_TYPE, op>();
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_sg_scan_inclusive");
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    log_info("SKIPPED: OpenCL C kernels not provided for this test. Skipping the test.\n");
-    return CL_SUCCESS;
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_sg_scan_inclusive");
-    RETURN_ON_ERROR(err)
-#endif
-
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
-    RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")
-
-    size_t param_value_size = 0;
-    err = clGetKernelSubGroupInfo(
-        kernel, device, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
-        sizeof(size_t), static_cast<void*>(&wg_size),
-        sizeof(size_t), static_cast<void*>(&sg_max_size),
-        &param_value_size
-    );
-    RETURN_ON_CL_ERROR(err, "clGetKernelSubGroupInfo")
-
-    // Verify size of returned param
-    if(param_value_size != sizeof(size_t))
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "Returned size of max sub group size not valid! (Expected %lu, got %lu)\n",
-            sizeof(size_t),
-            param_value_size
-        )
-    }
-
-    // Calculate global work size
-    size_t flat_work_size;
-    size_t wg_number = static_cast<size_t>(
-        std::ceil(static_cast<double>(count) / wg_size)
-    );
-    flat_work_size = wg_number * wg_size;
-    work_size[0] = flat_work_size;
-
-    std::vector<CL_INT_TYPE> input = generate_input<CL_INT_TYPE, op>(flat_work_size, wg_size);
-    std::vector<CL_INT_TYPE> output = generate_output<CL_INT_TYPE, op>(flat_work_size, wg_size);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(CL_INT_TYPE) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    buffers[1] =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(CL_INT_TYPE) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(CL_INT_TYPE) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, &wg_size, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(CL_INT_TYPE) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (verify_sg_scan_inclusive<CL_INT_TYPE, op>(input, output, wg_size, sg_max_size) != CL_SUCCESS)
-    {
-        RETURN_ON_ERROR_MSG(-1, "sub_group_scan_inclusive_%s %s failed", to_string(op).c_str(), type_name<CL_INT_TYPE>().c_str());
-    }
-    log_info("sub_group_scan_inclusive_%s %s passed\n", to_string(op).c_str(), type_name<CL_INT_TYPE>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-AUTO_TEST_CASE(test_sub_group_scan_inclusive_add)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = sub_group_scan_inclusive<cl_int, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_inclusive<cl_uint, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_inclusive<cl_long, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_inclusive<cl_ulong, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-AUTO_TEST_CASE(test_sub_group_scan_inclusive_min)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = sub_group_scan_inclusive<cl_int, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_inclusive<cl_uint, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_inclusive<cl_long, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_inclusive<cl_ulong, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-AUTO_TEST_CASE(test_sub_group_scan_inclusive_max)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = sub_group_scan_inclusive<cl_int, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_inclusive<cl_uint, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_inclusive<cl_long, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = sub_group_scan_inclusive<cl_ulong, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_SUBGROUPS_TEST_SG_SCAN_INCLUSIVE_HPP
diff --git a/test_conformance/clcpp/synchronization/CMakeLists.txt b/test_conformance/clcpp/synchronization/CMakeLists.txt
deleted file mode 100644
index 70d3637cb8..0000000000
--- a/test_conformance/clcpp/synchronization/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_SYNCHRONIZATION)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/synchronization/main.cpp b/test_conformance/clcpp/synchronization/main.cpp
deleted file mode 100644
index 04b5f36a54..0000000000
--- a/test_conformance/clcpp/synchronization/main.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "test_work_group_barrier.hpp"
-#include "test_sub_group_barrier.hpp"
-#include "named_barrier/test_spec_example.hpp"
-#include "named_barrier/test_named_barrier.hpp"
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/synchronization/named_barrier/common.hpp b/test_conformance/clcpp/synchronization/named_barrier/common.hpp
deleted file mode 100644
index e6ce8b208c..0000000000
--- a/test_conformance/clcpp/synchronization/named_barrier/common.hpp
+++ /dev/null
@@ -1,172 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_COMMON_HPP
-#define TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_COMMON_HPP
-
-#include <vector>
-
-// Common for all OpenCL C++ tests
-#include "../../common.hpp"
-#include "../../funcs_test_utils.hpp"
-
-#define RUN_WG_NAMED_BARRIER_TEST_MACRO(TEST_CLASS) \
-    last_error = run_work_group_named_barrier_barrier_test(  \
-        device, context, queue, num_elements, TEST_CLASS \
-    );  \
-    CHECK_ERROR(last_error) \
-    error |= last_error;
-
-namespace named_barrier {
-
-struct work_group_named_barrier_test_base : public detail::base_func_type<cl_uint>
-{
-    // Returns test name
-    virtual std::string str() = 0;
-    // Returns OpenCL program source
-    // It's assumed that this program has only one kernel.
-    virtual std::string generate_program() = 0;
-    // Return value that is expected to be in output_buffer[i]
-    virtual cl_uint operator()(size_t i, size_t work_group_size, size_t mas_sub_group_size) = 0;
-    // Kernel execution
-    // This covers typical case: kernel is executed once, kernel
-    // has only one argument which is output buffer
-    virtual cl_int execute(const cl_kernel kernel,
-                           const cl_mem output_buffer,
-                           const cl_command_queue& queue,
-                           const size_t work_size,
-                           const size_t work_group_size)
-    {
-        cl_int err;
-        err = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-        RETURN_ON_CL_ERROR(err, "clSetKernelArg")
-
-        err = clEnqueueNDRangeKernel(
-            queue, kernel, 1,
-            NULL, &work_size, &work_group_size,
-            0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel")
-        return err;
-    }
-    // Calculates maximal work-group size (one dim)
-    virtual size_t get_max_local_size(const cl_kernel kernel,
-                                      const cl_device_id device,
-                                      const size_t work_group_size, // default work-group size
-                                      cl_int& error)
-    {
-        size_t max_wg_size;
-        error = clGetKernelWorkGroupInfo(
-            kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL
-        );
-        RETURN_ON_ERROR(error)
-        return (std::min)(work_group_size, max_wg_size);
-    }
-    // if work-groups should be uniform
-    virtual bool enforce_uniform()
-    {
-        return false;
-    }
-};
-
-template <class work_group_named_barrier_test>
-int run_work_group_named_barrier_barrier_test(cl_device_id device, cl_context context, cl_command_queue queue,
-                                              size_t count, work_group_named_barrier_test test)
-{
-    cl_mem buffers[1];
-    cl_program program;
-    cl_kernel kernel;
-    size_t work_group_size;
-    size_t work_size[1];
-    cl_int err;
-
-    std::string code_str = test.generate_program();
-    std::string kernel_name = test.get_kernel_name();
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name, "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(err)
-#endif
-
-    // Find the max possible wg size for among all the kernels
-    work_group_size = test.get_max_local_size(kernel, device, 256, err);
-    RETURN_ON_ERROR(err);
-    if(work_group_size == 0)
-    {
-        log_info("SKIPPED: Can't produce local size with enough sub-groups. Skipping tests.\n");
-        return CL_SUCCESS;
-    }
-
-    work_size[0] = count;
-    // uniform work-group
-    if(test.enforce_uniform())
-    {
-        size_t wg_number = static_cast<size_t>(
-            std::ceil(static_cast<double>(work_size[0]) / work_group_size)
-        );
-        work_size[0] = wg_number * work_group_size;
-    }
-
-    // host output vector
-    std::vector<cl_uint> output = generate_output<cl_uint>(work_size[0], 9999);
-
-    // device output buffer
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    // Execute test kernels
-    err = test.execute(kernel, buffers[0], queue, work_size[0], work_group_size);
-    RETURN_ON_ERROR(err)
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer")
-
-    // Check output values
-    for(size_t i = 0; i < output.size(); i++)
-    {
-        cl_uint v = test(i, work_group_size, i);
-        if(!(are_equal(v, output[i], ::detail::make_value<cl_uint>(0), test)))
-        {
-            RETURN_ON_ERROR_MSG(-1,
-                "test_%s(%s) failed. Expected: %s, got: %s", test.str().c_str(), type_name<cl_uint>().c_str(),
-                format_value(v).c_str(), format_value(output[i]).c_str()
-            );
-        }
-    }
-    log_info("test_%s(%s) passed\n", test.str().c_str(), type_name<cl_uint>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-} // namespace named_barrier
-
-#endif // TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_COMMON_HPP
diff --git a/test_conformance/clcpp/synchronization/named_barrier/test_named_barrier.hpp b/test_conformance/clcpp/synchronization/named_barrier/test_named_barrier.hpp
deleted file mode 100644
index a0f57b24ff..0000000000
--- a/test_conformance/clcpp/synchronization/named_barrier/test_named_barrier.hpp
+++ /dev/null
@@ -1,491 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_TEST_NAMED_BARRIER_HPP
-#define TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_TEST_NAMED_BARRIER_HPP
-
-#include "common.hpp"
-
-namespace named_barrier {
-
-struct local_fence_named_barrier_test : public work_group_named_barrier_test_base
-{
-    std::string str()
-    {
-        return "local_fence";
-    }
-
-    // Return value that is expected to be in output_buffer[i]
-    cl_uint operator()(size_t i, size_t work_group_size, size_t max_sub_group_size)
-    {
-        return static_cast<cl_uint>(i);
-    }
-
-    // At the end every work-item writes its global id to ouput[work-item-global-id].
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-            return
-                "__kernel void " + this->get_kernel_name() + "(global uint *output, "
-                                                              "local uint * lmem)\n"
-                "{\n"
-                "  size_t gid = get_global_id(0);\n"
-                "  output[gid] = gid;\n"
-                "}\n";
-
-        #else
-            return
-                "#define cl_khr_subgroup_named_barrier\n"
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_synchronization>\n"
-                "using namespace cl;\n"
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<uint[]> output, "
-                                                              "local_ptr<uint[]> lmem)\n"
-                "{\n\n"
-                "  local<work_group_named_barrier> a(1);\n"
-                "  local<work_group_named_barrier> b(2);\n"
-                "  size_t gid = get_global_id(0);\n"
-                "  size_t lid = get_local_id(0);\n"
-                "  size_t value;\n"
-                "  if(get_num_sub_groups() == 1)\n"
-                "  {\n"
-                "    size_t other_lid = (lid + 1) % get_enqueued_local_size(0);\n"
-                "    size_t other_gid = (gid - lid) + other_lid;\n"
-                "    lmem[other_lid] = other_gid;\n"
-                "    a.wait(mem_fence::local);\n"
-                "    value = lmem[lid];" // lmem[lid] shoule be equal to gid
-                "  }\n"
-                "  else if(get_num_sub_groups() == 2)\n"
-                "  {\n"
-                "    size_t other_lid = (lid + get_max_sub_group_size()) % get_enqueued_local_size(0);\n"
-                "    size_t other_gid = (gid - lid) + other_lid;\n"
-                "    lmem[other_lid] = other_gid;\n"
-                "    b.wait(mem_fence::local);\n"
-                "    value = lmem[lid];" // lmem[lid] shoule be equal to gid
-                "  }\n"
-                "  else if(get_num_sub_groups() > 2)\n"
-                "  {\n"
-                "    if(get_sub_group_id() < 2)\n"
-                "    {\n"
-                "      const size_t two_first_subgroups = 2 * get_max_sub_group_size();"
-                       // local and global id of some work-item outside of work-item subgroup,
-                       // but within subgroups 0 and 1.
-                "      size_t other_lid = (lid + get_max_sub_group_size()) % two_first_subgroups;\n"
-                "      size_t other_gid = (gid - lid) + other_lid;\n"
-                "      lmem[other_lid] = other_gid;\n"
-                "      b.wait(mem_fence::local);\n" // subgroup 0 and 1 are sync (local)
-                "      value = lmem[lid];" // lmem[lid] shoule be equal to gid
-                "    }\n"
-                "    else\n"
-                "    {\n"
-                "      value = gid;\n"
-                "    }\n"
-                "  }\n"
-                "  output[gid] = value;\n"
-                "}\n";
-        #endif
-    }
-
-    size_t get_max_local_size(const cl_kernel kernel,
-                              const cl_device_id device,
-                              const size_t work_group_size, // default work-group size
-                              cl_int& error)
-    {
-        // Set size of the local memory, we need to to this to correctly calculate
-        // max possible work-group size.
-        size_t wg_size;
-        for(wg_size = work_group_size; wg_size > 1; wg_size /= 2)
-        {
-            error = clSetKernelArg(kernel, 1, wg_size * sizeof(cl_uint), NULL);
-            RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-            size_t max_wg_size;
-            error = clGetKernelWorkGroupInfo(
-                kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL
-            );
-            RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
-            if(max_wg_size >= wg_size) break;
-        }
-        return wg_size;
-    }
-
-    cl_int execute(const cl_kernel kernel,
-                   const cl_mem output_buffer,
-                   const cl_command_queue queue,
-                   const size_t work_size,
-                   const size_t work_group_size)
-    {
-        cl_int err;
-        // Get context from queue
-        cl_context context;
-        err = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, NULL);
-        RETURN_ON_CL_ERROR(err, "clGetCommandQueueInfo")
-
-        err = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-        err |= clSetKernelArg(kernel, 1, work_group_size * sizeof(cl_uint), NULL);
-        RETURN_ON_CL_ERROR(err, "clSetKernelArg")
-
-        err = clEnqueueNDRangeKernel(
-            queue, kernel, 1,
-            NULL, &work_size, &work_group_size,
-            0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel")
-
-        err = clFinish(queue);
-        return err;
-    }
-};
-
-struct global_fence_named_barrier_test : public work_group_named_barrier_test_base
-{
-    std::string str()
-    {
-        return "global_fence";
-    }
-
-    // Return value that is expected to be in output_buffer[i]
-    cl_uint operator()(size_t i, size_t work_group_size, size_t max_sub_group_size)
-    {
-        return static_cast<cl_uint>(i % work_group_size);
-    }
-
-    // At the end every work-item writes its local id to ouput[work-item-global-id].
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-            return
-                "__kernel void " + this->get_kernel_name() + "(global uint * output, "
-                                                              "global uint * temp)\n"
-                "{\n"
-                "size_t gid = get_global_id(0);\n"
-                "output[gid] = get_local_id(0);\n"
-                "}\n";
-
-        #else
-            return
-                "#define cl_khr_subgroup_named_barrier\n"
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_synchronization>\n"
-                "using namespace cl;\n"
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<uint[]> output, "
-                                                              "global_ptr<uint[]> temp)\n"
-                "{\n\n"
-                "  local<work_group_named_barrier> a(1);\n"
-                "  local<work_group_named_barrier> b(2);\n"
-                "  size_t gid = get_global_id(0);\n"
-                "  size_t lid = get_local_id(0);\n"
-                "  size_t value;\n"
-                "  if(get_num_sub_groups() == 1)\n"
-                "  {\n"
-                "    size_t other_lid = (lid + 1) % get_enqueued_local_size(0);\n"
-                "    size_t other_gid = (gid - lid) + other_lid;\n"
-                "    temp[other_gid] = other_lid + 1;\n"
-                "    a.wait(mem_fence::global);\n"
-                "    size_t other_lid_same_subgroup = (lid + 2) % get_sub_group_size();\n"
-                "    size_t other_gid_same_subgroup = (gid - lid) + other_lid_same_subgroup;\n"
-                "    temp[other_gid_same_subgroup] = temp[other_gid_same_subgroup] - 1;\n"
-                "    a.wait(mem_fence::global, memory_scope_sub_group);\n"
-                "    value = temp[gid];" // temp[gid] shoule be equal to lid
-                "  }\n"
-                "  else if(get_num_sub_groups() == 2)\n"
-                "  {\n"
-                "    size_t other_lid = (lid + get_max_sub_group_size()) % get_enqueued_local_size(0);\n"
-                "    size_t other_gid = (gid - lid) + other_lid;\n"
-                "    temp[other_gid] = other_lid + 1;\n"
-                "    b.wait(mem_fence::global);\n" // both subgroups wait, both are sync
-                "    size_t other_lid_same_subgroup = "
-                       "((lid + 1) % get_sub_group_size()) + (get_sub_group_id() * get_sub_group_size());\n"
-                "    size_t other_gid_same_subgroup = (gid - lid) + other_lid_same_subgroup;\n"
-                "    temp[other_gid_same_subgroup] = temp[other_gid_same_subgroup] - 1;\n"
-                "    b.wait(mem_fence::global, memory_scope_sub_group);\n"  // both subgroups wait, sync only within subgroup
-                "    value = temp[gid];" // temp[gid] shoule be equal to lid
-                "  }\n"
-                "  else if(get_num_sub_groups() > 2)\n"
-                "  {\n"
-                "    if(get_sub_group_id() < 2)\n"
-                "    {\n"
-                "      const size_t two_first_subgroups = 2 * get_max_sub_group_size();"
-                       // local and global id of some work-item outside of work-item subgroup,
-                       // but within subgroups 0 and 1.
-                "      size_t other_lid = (lid + get_max_sub_group_size()) % two_first_subgroups;\n"
-                "      size_t other_gid = (gid - lid) + other_lid;\n"
-                "      temp[other_gid] = other_lid + 1;\n"
-                "      b.wait(mem_fence::global);\n" // both subgroups wait, both are sync
-                       // local and global id of some other work-item within work-item subgroup
-                "      size_t other_lid_same_subgroup = "
-                         "((lid + 1) % get_sub_group_size()) + (get_sub_group_id() * get_sub_group_size());\n"
-                "      size_t other_gid_same_subgroup = (gid - lid) + other_lid_same_subgroup;\n"
-                "      temp[other_gid_same_subgroup] = temp[other_gid_same_subgroup] - 1;\n"
-                "      b.wait(mem_fence::global, memory_scope_sub_group);\n" // both subgroups wait, sync only within subgroup
-                "      value = temp[gid];" // temp[gid] shoule be equal to lid
-                "    }\n"
-                "    else\n"
-                "    {\n"
-                "      value = lid;\n"
-                "    }\n"
-                "  }\n"
-                "  output[gid] = value;\n"
-                "}\n";
-        #endif
-    }
-
-    size_t get_max_local_size(const cl_kernel kernel,
-                              const cl_device_id device,
-                              const size_t work_group_size, // default work-group size
-                              cl_int& error)
-    {
-        size_t max_wg_size;
-        error = clGetKernelWorkGroupInfo(
-            kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL
-        );
-        RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
-        return (std::min)(max_wg_size, work_group_size);
-    }
-
-    cl_int execute(const cl_kernel kernel,
-                   const cl_mem output_buffer,
-                   const cl_command_queue queue,
-                   const size_t work_size,
-                   const size_t work_group_size)
-    {
-        cl_int err;
-        // Get context from queue
-        cl_context context;
-        err = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, NULL);
-        RETURN_ON_CL_ERROR(err, "clGetCommandQueueInfo")
-
-        // create temp buffer
-        auto temp_buffer =
-            clCreateBuffer(context, CL_MEM_READ_WRITE,
-                           sizeof(cl_uint) * work_size, NULL, &err);
-        RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-        err = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-        err |= clSetKernelArg(kernel, 1, sizeof(temp_buffer), &temp_buffer);
-        RETURN_ON_CL_ERROR(err, "clSetKernelArg")
-
-        err = clEnqueueNDRangeKernel(
-            queue, kernel, 1,
-            NULL, &work_size, &work_group_size,
-            0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel")
-
-        err = clFinish(queue);
-        err |= clReleaseMemObject(temp_buffer);
-
-        return err;
-    }
-};
-
-struct global_local_fence_named_barrier_test : public work_group_named_barrier_test_base
-{
-    std::string str()
-    {
-        return "global_local_fence";
-    }
-
-    // Return value that is expected to be in output_buffer[i]
-    cl_uint operator()(size_t i, size_t work_group_size, size_t max_sub_group_size)
-    {
-        return static_cast<cl_uint>(i % work_group_size);
-    }
-
-    // At the end every work-item writes its local id to ouput[work-item-global-id].
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-            return
-                "__kernel void " + this->get_kernel_name() + "(global uint * output, "
-                                                              "global uint * temp,"
-                                                              "local uint * lmem)\n"
-                "{\n"
-                "size_t gid = get_global_id(0);\n"
-                "output[gid] = get_local_id(0);\n"
-                "}\n";
-
-        #else
-            return
-                "#define cl_khr_subgroup_named_barrier\n"
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_synchronization>\n"
-                "using namespace cl;\n"
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<uint[]> output, "
-                                                              "global_ptr<uint[]> temp,"
-                                                              "local_ptr<uint[]> lmem)\n"
-                "{\n\n"
-                "  local<work_group_named_barrier> a(1);\n"
-                "  local<work_group_named_barrier> b(2);\n"
-                "  size_t gid = get_global_id(0);\n"
-                "  size_t lid = get_local_id(0);\n"
-                "  size_t value = 0;\n"
-                "  if(get_num_sub_groups() == 1)\n"
-                "  {\n"
-                "    size_t other_lid = (lid + 1) % get_enqueued_local_size(0);\n"
-                "    size_t other_gid = (gid - lid) + other_lid;\n"
-                "    lmem[other_lid] = other_gid;\n"
-                "    temp[other_gid] = other_lid;\n"
-                "    a.wait(mem_fence::local | mem_fence::global);\n"
-                "    if(lmem[lid] == gid) value = temp[gid];\n"
-                "  }\n"
-                "  else if(get_num_sub_groups() == 2)\n"
-                "  {\n"
-                "    size_t other_lid = (lid + get_max_sub_group_size()) % get_enqueued_local_size(0);\n"
-                "    size_t other_gid = (gid - lid) + other_lid;\n"
-                "    lmem[other_lid] = other_gid;\n"
-                "    temp[other_gid] = other_lid;\n"
-                "    b.wait(mem_fence::local | mem_fence::global);\n"
-                "    if(lmem[lid] == gid) value = temp[gid];\n"
-                "  }\n"
-                "  else if(get_num_sub_groups() > 2)\n"
-                "  {\n"
-                "    if(get_sub_group_id() < 2)\n"
-                "    {\n"
-                "      const size_t two_first_subgroups = 2 * get_max_sub_group_size();"
-                       // local and global id of some work-item outside of work-item subgroup,
-                       // but within subgroups 0 and 1.
-                "      size_t other_lid = (lid + get_max_sub_group_size()) % two_first_subgroups;\n"
-                "      size_t other_gid = (gid - lid) + other_lid;\n"
-                "      lmem[other_lid] = other_gid;\n"
-                "      temp[other_gid] = other_lid;\n"
-                "      b.wait(mem_fence::local | mem_fence::global);\n"
-                "      if(lmem[lid] == gid) value = temp[gid];\n"
-                "    }\n"
-                "    else\n"
-                "    {\n"
-                "      value = lid;\n"
-                "    }\n"
-                "  }\n"
-                "  output[gid] = value;\n"
-                "}\n";
-        #endif
-    }
-
-    size_t get_max_local_size(const cl_kernel kernel,
-                              const cl_device_id device,
-                              const size_t work_group_size, // default work-group size
-                              cl_int& error)
-    {
-        // Set size of the local memory, we need to to this to correctly calculate
-        // max possible work-group size.
-        size_t wg_size;
-        for(wg_size = work_group_size; wg_size > 1; wg_size /= 2)
-        {
-            error = clSetKernelArg(kernel, 2, wg_size * sizeof(cl_uint), NULL);
-            RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-            size_t max_wg_size;
-            error = clGetKernelWorkGroupInfo(
-                kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL
-            );
-            RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
-            if(max_wg_size >= wg_size) break;
-        }
-        return wg_size;
-    }
-
-    cl_int execute(const cl_kernel kernel,
-                   const cl_mem output_buffer,
-                   const cl_command_queue queue,
-                   const size_t work_size,
-                   const size_t work_group_size)
-    {
-        cl_int err;
-        // Get context from queue
-        cl_context context;
-        err = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, NULL);
-        RETURN_ON_CL_ERROR(err, "clGetCommandQueueInfo")
-
-        // create temp buffer
-        auto temp_buffer =
-            clCreateBuffer(context, CL_MEM_READ_WRITE,
-                           sizeof(cl_uint) * work_size, NULL, &err);
-        RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-        err = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-        err |= clSetKernelArg(kernel, 1, sizeof(temp_buffer), &temp_buffer);
-        err |= clSetKernelArg(kernel, 2, work_group_size * sizeof(cl_uint), NULL);
-        RETURN_ON_CL_ERROR(err, "clSetKernelArg")
-
-        err = clEnqueueNDRangeKernel(
-            queue, kernel, 1,
-            NULL, &work_size, &work_group_size,
-            0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel")
-
-        err = clFinish(queue);
-        err |= clReleaseMemObject(temp_buffer);
-
-        return err;
-    }
-};
-
-// ------------------------------------------------------------------------------
-// -------------------------- RUN TESTS -----------------------------------------
-// ------------------------------------------------------------------------------
-AUTO_TEST_CASE(test_work_group_named_barrier)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-#if !(defined(DEVELOPMENT) && (defined(USE_OPENCLC_KERNELS) || defined(ONLY_SPIRV_COMPILATION)))
-    if(!is_extension_available(device, "cl_khr_subgroup_named_barrier"))
-    {
-        log_info("SKIPPED: Extension `cl_khr_subgroup_named_barrier` is not supported. Skipping tests.\n");
-        return CL_SUCCESS;
-    }
-
-    // An implementation shall support at least 8 named barriers per work-group. The exact
-    // maximum number can be queried using clGetDeviceInfo with CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR
-    // from the OpenCL 2.2 Extension Specification.
-    cl_uint named_barrier_count;
-    error = clGetDeviceInfo(device, CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR, sizeof(cl_uint), &named_barrier_count, NULL);
-    RETURN_ON_CL_ERROR(error, "clGetDeviceInfo")
-
-    if(named_barrier_count < 8)
-    {
-        RETURN_ON_ERROR_MSG(-1, "Maximum number of named barriers must be at least 8.");
-    }
-#endif
-
-    RUN_WG_NAMED_BARRIER_TEST_MACRO(local_fence_named_barrier_test())
-    RUN_WG_NAMED_BARRIER_TEST_MACRO(global_fence_named_barrier_test())
-    RUN_WG_NAMED_BARRIER_TEST_MACRO(global_local_fence_named_barrier_test())
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-} // namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_TEST_NAMED_BARRIER_HPP
diff --git a/test_conformance/clcpp/synchronization/named_barrier/test_spec_example.hpp b/test_conformance/clcpp/synchronization/named_barrier/test_spec_example.hpp
deleted file mode 100644
index 7afbd00f1f..0000000000
--- a/test_conformance/clcpp/synchronization/named_barrier/test_spec_example.hpp
+++ /dev/null
@@ -1,325 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_TEST_SPEC_EXAMPLE_HPP
-#define TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_TEST_SPEC_EXAMPLE_HPP
-
-#include "common.hpp"
-
-namespace named_barrier {
-
-// ------------------------------------------------------------------------------
-// ----------------------- SPECIFICATION EXAMPLE TEST----------------------------
-// ------------------------------------------------------------------------------
-// This test is based on the example in OpenCL C++ 1.0 specification (OpenCL C++
-// Standard Library > Synchronization Functions > Named barriers > wait).
-struct spec_example_work_group_named_barrier_test : public work_group_named_barrier_test_base
-{
-    std::string str()
-    {
-        return "spec_example";
-    }
-
-    // Return value that is expected to be in output_buffer[i]
-    cl_uint operator()(size_t i, size_t work_group_size, size_t mas_sub_group_size)
-    {
-        return static_cast<cl_uint>(i);
-    }
-
-    // At the end every work-item writes its global id to ouput[work-item-global-id].
-    std::string generate_program()
-    {
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-            return
-                // In OpenCL C kernel we imitate subgroups by partitioning work-group (based on
-                // local ids of work-items), work_group_named_barrier.wait(..) calls are replaced
-                // with work_group_barriers.
-                "__kernel void " + this->get_kernel_name() + "(global uint *output, "
-                                                              "global uint * temp, "
-                                                              "local uint * lmem)\n"
-                "{\n"
-                "size_t gid = get_global_id(0);\n"
-                "size_t lid = get_local_id(0);\n"
-
-                // We divide work-group into ranges:
-                // [0 - e_wg)[ew_g; q_wg)[q_wg; 3 * ew_g)[3 * ew_g; h_wg)[h_wg; get_local_size(0) - 1]
-                // to simulate 8 subgroups
-                "size_t h_wg = get_local_size(0) / 2;\n" // half of work-group
-                "size_t q_wg = get_local_size(0) / 4;\n" // quarter
-                "size_t e_wg = get_local_size(0) / 8;\n" // one-eighth
-
-                "if(lid < h_wg) lmem[lid] = gid;\n" // [0; h_wg)
-                "else           temp[gid] = gid;\n" // [h_wg; get_local_size(0) - 1)
-                "work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"
-
-                "size_t other_lid = (lid + q_wg) % h_wg;\n"
-                "size_t value = 0;\n"
-                "if(lmem[other_lid] == ((gid - lid) + other_lid)){\n"
-                "     value = gid;\n"
-                "}\n"
-                "work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"
-
-                "if(lid < q_wg){\n" // [0; q_wg)
-                "    if(lid < e_wg) lmem[lid + e_wg] = gid;\n" // [0; e_wg)
-                "    else           lmem[lid - e_wg] = gid;\n" // [e_wg; q_wg)
-                "}\n"
-                "else if(lid < h_wg) {\n" // [q_wg; h_wg)
-                "    if(lid < (3 * e_wg)) lmem[lid + e_wg] = gid;\n" // [q_ww; q_wg + e_wg)
-                "    else                 lmem[lid - e_wg] = gid;\n" // [q_wg + e_wg; h_wg)
-                "}\n"
-                "work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"
-
-                "if(lid < q_wg){\n" // [0; q_wg)
-                "    output[gid + q_wg] = lmem[lid];\n"
-                "}\n"
-                "else if(lid < h_wg) {\n" // [q_wg; h_wg)
-                "    output[gid - q_wg] = lmem[lid];\n"
-                "}\n"
-                "work_group_barrier(CLK_GLOBAL_MEM_FENCE);\n"
-
-                "if(lid < q_wg){\n" // [0; q_wg)
-                "    if(lid < e_wg) temp[gid] = output[gid + (3 * e_wg)];\n" // [0; e_wg)
-                "    else           temp[gid] = output[gid + e_wg];\n" // [e_wg; q_wg)
-                "}\n"
-                "else if(lid < h_wg) {\n" // [q_wg; h_wg)
-                "    if(lid < (3 * e_wg)) temp[gid] = output[gid - e_wg];\n"  // [q_ww; q_wg + e_wg)
-                "    else                 temp[gid] = output[gid - (3 * e_wg)];\n"  // [q_wg + e_wg; h_wg)
-                "}\n"
-                "work_group_barrier(CLK_GLOBAL_MEM_FENCE);\n"
-
-                "output[gid] = temp[gid];\n"
-                "}\n";
-
-        #else
-            return
-                "#define cl_khr_subgroup_named_barrier\n"
-                "#include <opencl_memory>\n"
-                "#include <opencl_work_item>\n"
-                "#include <opencl_synchronization>\n"
-                "using namespace cl;\n"
-
-                "void b_function(work_group_named_barrier &b, size_t value, local_ptr<uint[]> lmem)\n"
-                "{\n\n"
-                "size_t lid = get_local_id(0);\n"
-                // Work-items from the 1st subgroup writes to local memory that will be
-                // later read byt the 0th subgroup, and the other way around - 0th subgroup
-                // writes what 1st subgroup will later read.
-                // b.wait(mem_fence::local) should provide sync between those two subgroups.
-                "if(get_sub_group_id() < 1) lmem[lid + get_max_sub_group_size()] = value;\n"
-                "else                       lmem[lid - get_max_sub_group_size()] = value;\n"
-                "b.wait(mem_fence::local);\n\n" // sync writes to lmem for 2 subgroups (ids: 0, 1)
-                "}\n"
-
-                "__kernel void " + this->get_kernel_name() + "(global_ptr<uint[]> output, "
-                                                              "global_ptr<uint[]> temp, "
-                                                              "local_ptr<uint[]> lmem)\n"
-                "{\n\n"
-                "local<work_group_named_barrier> a(4);\n"
-                "local<work_group_named_barrier> b(2);\n"
-                "local<work_group_named_barrier> c(2);\n"
-
-                "size_t gid = get_global_id(0);\n"
-                "size_t lid = get_local_id(0);\n"
-                "if(get_sub_group_id() < 4)"
-                "{\n"
-                "    lmem[lid] = gid;\n"
-                "    a.wait(mem_fence::local);\n" // sync writes to lmem for 4 subgroups (ids: 0, 1, 2, 3)
-                     // Now all four subgroups should see changes in lmem.
-                "    size_t other_lid = (lid + (2 * get_max_sub_group_size())) % (4 * get_max_sub_group_size());\n"
-                "    size_t value = 0;\n"
-                "    if(lmem[other_lid] == ((gid - lid) + other_lid)){\n"
-                "        value = gid;\n"
-                "    }\n"
-                "    a.wait(mem_fence::local);\n" // sync reads from lmem for 4 subgroups (ids: 0, 1, 2, 3)
-
-                "    if(get_sub_group_id() < 2)" // ids: 0, 1
-                "    {\n"
-                "        b_function(b, value, lmem);\n"
-                "    }\n"
-                "    else" // ids: 2, 3
-                "    {\n"
-                         // Work-items from the 2nd subgroup writes to local memory that will be
-                         // later read byt the 3rd subgroup, and the other way around - 3rd subgroup
-                         // writes what 2nd subgroup will later read.
-                         // c.wait(mem_fence::local) should provide sync between those two subgroups.
-                "        if(get_sub_group_id() < 3) lmem[lid + get_max_sub_group_size()] = value ;\n"
-                "        else                       lmem[lid - get_max_sub_group_size()] = value;\n"
-                "        c.wait(mem_fence::local);\n" // sync writes to lmem for 2 subgroups (3, 4)
-                "    }\n"
-
-                     // Now (0, 1) are in sync (local mem), and (3, 4) are in sync (local mem).
-                     // However, subgroups (0, 1) are not in sync with (3, 4).
-                "    if(get_sub_group_id() < 4) {\n" // ids: 0, 1, 2, 3
-                "        if(get_sub_group_id() < 2) output[gid + (2 * get_max_sub_group_size())] = lmem[lid];\n"
-                "        else                       output[gid - (2 * get_max_sub_group_size())] = lmem[lid];\n"
-                "        a.wait(mem_fence::global);\n" // sync writes to global memory (output)
-                                                       // for 4 subgroups (0, 1, 2, 3)
-                "    }\n"
-                "}\n"
-                "else {\n" // subgroups with id > 4
-                "    temp[gid] = gid;\n"
-                "}\n"
-
-                // Now (0, 1, 2, 3) are in sync (global mem)
-                "if(get_sub_group_id() < 2) {\n"
-                "    if(get_sub_group_id() < 1) temp[gid] = output[gid + (3 * get_max_sub_group_size())];\n"
-                "    else                       temp[gid] = output[gid + (get_max_sub_group_size())];\n"
-                "}\n"
-                "else if(get_sub_group_id() < 4) {\n"
-                "    if(get_sub_group_id() < 3) temp[gid] = output[gid - (get_max_sub_group_size())];\n"
-                "    else                       temp[gid] = output[gid - (3 * get_max_sub_group_size())];\n"
-                "}\n"
-
-                // Synchronize the entire work-group (in terms of accesses to global memory)
-                "work_group_barrier(mem_fence::global);\n"
-                "output[gid] = temp[gid];\n\n"
-                "}\n";
-        #endif
-    }
-
-    size_t get_max_local_size(const cl_kernel kernel,
-                              const cl_device_id device,
-                              const size_t work_group_size, // default work-group size
-                              cl_int& error)
-    {
-        // Set size of the local memory, we need to to this to correctly calculate
-        // max possible work-group size.
-        size_t wg_size;
-        for(wg_size = work_group_size; wg_size > 1; wg_size /= 2)
-        {
-            error = clSetKernelArg(kernel, 2, ((wg_size / 2) + 1) * sizeof(cl_uint), NULL);
-            RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-            size_t max_wg_size;
-            error = clGetKernelWorkGroupInfo(
-                kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL
-            );
-            RETURN_ON_ERROR(error)
-            if(max_wg_size >= wg_size) break;
-        }
-
-        // -----------------------------------------------------------------------------------
-        // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-        // -----------------------------------------------------------------------------------
-        #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-            // make sure wg_size is a multiple of 8
-            if(wg_size % 8 > 0) wg_size -= (wg_size % 8);
-            return wg_size;
-        #else
-            // make sure that wg_size will produce at least min_num_sub_groups
-            // subgroups in each work-group
-            size_t local_size[3] = { 1, 1, 1 };
-            size_t min_num_sub_groups = 8;
-            error = clGetKernelSubGroupInfo(kernel, device, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT,
-                                            sizeof(size_t), &min_num_sub_groups,
-                                            sizeof(size_t) * 3, &local_size, NULL);
-            RETURN_ON_CL_ERROR(error, "clGetKernelSubGroupInfo")
-            if (local_size[0] == 0 || local_size[1] != 1 || local_size[2] != 1)
-            {
-                if(min_num_sub_groups == 1)
-                {
-                    RETURN_ON_ERROR_MSG(-1, "Can't produce local size with one subgroup")
-                }
-                return 0;
-            }
-            local_size[0] = (std::min)(wg_size, local_size[0]);
-
-            // double-check
-            size_t sub_group_count_for_ndrange;
-            error = clGetKernelSubGroupInfo(kernel, device, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE,
-                                            sizeof(size_t) * 3, local_size,
-                                            sizeof(size_t), &sub_group_count_for_ndrange, NULL);
-            RETURN_ON_CL_ERROR(error, "clGetKernelSubGroupInfo")
-            if (sub_group_count_for_ndrange < min_num_sub_groups)
-            {
-                RETURN_ON_ERROR_MSG(-1,
-                    "CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE did not return correct value (expected >=%lu, got %lu)",
-                    min_num_sub_groups, sub_group_count_for_ndrange
-                )
-            }
-
-            return local_size[0];
-        #endif
-    }
-
-    cl_int execute(const cl_kernel kernel,
-                   const cl_mem output_buffer,
-                   const cl_command_queue queue,
-                   const size_t work_size,
-                   const size_t work_group_size)
-    {
-        cl_int err;
-        // Get context from queue
-        cl_context context;
-        err = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, NULL);
-        RETURN_ON_CL_ERROR(err, "clGetCommandQueueInfo")
-
-        // create temp buffer
-        auto temp_buffer =
-            clCreateBuffer(context, CL_MEM_READ_WRITE,
-                           sizeof(cl_uint) * work_size, NULL, &err);
-        RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-        err = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-        err |= clSetKernelArg(kernel, 1, sizeof(temp_buffer), &temp_buffer);
-        err |= clSetKernelArg(kernel, 2, work_group_size * sizeof(cl_uint), NULL);
-        RETURN_ON_CL_ERROR(err, "clSetKernelArg")
-
-        err = clEnqueueNDRangeKernel(
-            queue, kernel, 1,
-            NULL, &work_size, &work_group_size,
-            0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel")
-
-        err = clFinish(queue);
-        err |= clReleaseMemObject(temp_buffer);
-
-        return err;
-    }
-};
-
-// ------------------------------------------------------------------------------
-// -------------------------- RUN TESTS -----------------------------------------
-// ------------------------------------------------------------------------------
-AUTO_TEST_CASE(test_work_group_named_barrier_spec_example)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-#if !(defined(DEVELOPMENT) && (defined(USE_OPENCLC_KERNELS) || defined(ONLY_SPIRV_COMPILATION)))
-    if(!is_extension_available(device, "cl_khr_subgroup_named_barrier"))
-    {
-        log_info("SKIPPED: Extension `cl_khr_subgroup_named_barrier` is not supported. Skipping tests.\n");
-        return CL_SUCCESS;
-    }
-#endif
-
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-    RUN_WG_NAMED_BARRIER_TEST_MACRO(spec_example_work_group_named_barrier_test())
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-} // namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_TEST_SPEC_EXAMPLE_HPP
diff --git a/test_conformance/clcpp/synchronization/test_sub_group_barrier.hpp b/test_conformance/clcpp/synchronization/test_sub_group_barrier.hpp
deleted file mode 100644
index c7074ed026..0000000000
--- a/test_conformance/clcpp/synchronization/test_sub_group_barrier.hpp
+++ /dev/null
@@ -1,342 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_TEST_SUB_GROUP_BARRIER_HPP
-#define TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_TEST_SUB_GROUP_BARRIER_HPP
-
-#include <sstream>
-#include <string>
-#include <vector>
-#include <algorithm>
-#include <random>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-
-
-namespace test_sub_group_barrier {
-
-enum class barrier_type
-{
-    local,
-    global
-};
-
-struct test_options
-{
-    barrier_type barrier;
-    size_t max_count;
-    size_t num_tests;
-};
-
-const std::string source_common = R"(
-    // Circular shift of sub-group local ids
-    size_t get_shifted_local_id(int sub_group_local_id_delta)
-    {
-        const int sub_group_size = (int)get_sub_group_size();
-        return (get_local_id(0) - get_sub_group_local_id()) +
-            (((int)get_sub_group_local_id() + sub_group_local_id_delta) % sub_group_size + sub_group_size) % sub_group_size;
-    }
-
-    // Get global ids from shifted local ids
-    size_t get_shifted_global_id(int sub_group_local_id_delta)
-    {
-        return get_group_id(0) * get_enqueued_local_size(0) + get_shifted_local_id(sub_group_local_id_delta);
-    }
-)";
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-std::string generate_source(test_options options)
-{
-    std::stringstream s;
-    s << R"(
-    #pragma OPENCL EXTENSION cl_khr_subgroups : enable
-    )";
-    s << source_common;
-    if (options.barrier == barrier_type::global)
-    {
-        s << R"(
-    kernel void test(const int iter_lo, const int iter_hi, global long *output)
-    {
-        const size_t gid = get_shifted_global_id(0);
-
-        output[gid] = gid;
-        sub_group_barrier(CLK_GLOBAL_MEM_FENCE);
-
-        for (int i = iter_lo; i < iter_hi; i++)
-        {
-            const size_t other_gid = get_shifted_global_id(i);
-
-            output[other_gid] += other_gid;
-            sub_group_barrier(CLK_GLOBAL_MEM_FENCE);
-
-            output[gid] += gid;
-            sub_group_barrier(CLK_GLOBAL_MEM_FENCE);
-        }
-    }
-    )";
-    }
-    else if (options.barrier == barrier_type::local)
-    {
-        s << R"(
-    kernel void test(const int iter_lo, const int iter_hi, global long *output, local long *values)
-    {
-        const size_t gid = get_shifted_global_id(0);
-        const size_t lid = get_shifted_local_id(0);
-
-        values[lid] = gid;
-        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
-
-        for (int i = iter_lo; i < iter_hi; i++)
-        {
-            const size_t other_lid = get_shifted_local_id(i);
-            const size_t other_gid = get_shifted_global_id(i);
-
-            values[other_lid] += other_gid;
-            sub_group_barrier(CLK_LOCAL_MEM_FENCE);
-
-            values[lid] += gid;
-            sub_group_barrier(CLK_LOCAL_MEM_FENCE);
-        }
-
-        output[gid] = values[lid];
-    }
-    )";
-    }
-
-    return s.str();
-}
-#else
-std::string generate_source(test_options options)
-{
-    std::stringstream s;
-    s << R"(
-    #include <opencl_memory>
-    #include <opencl_work_item>
-    #include <opencl_synchronization>
-
-    using namespace cl;
-
-    )";
-    s << source_common;
-
-    if (options.barrier == barrier_type::global)
-    {
-        s << R"(
-    kernel void test(const int iter_lo, const int iter_hi, global_ptr<long[]> output)
-    {
-        const size_t gid = get_shifted_global_id(0);
-
-        output[gid] = gid;
-        sub_group_barrier(mem_fence::global);
-
-        for (int i = iter_lo; i < iter_hi; i++)
-        {
-            const size_t other_gid = get_shifted_global_id(i);
-
-            output[other_gid] += other_gid;
-            sub_group_barrier(mem_fence::global);
-
-            output[gid] += gid;
-            sub_group_barrier(mem_fence::global);
-        }
-    }
-    )";
-    }
-    else if (options.barrier == barrier_type::local)
-    {
-        s << R"(
-    kernel void test(const int iter_lo, const int iter_hi, global_ptr<long[]> output, local_ptr<long[]> values)
-    {
-        const size_t gid = get_shifted_global_id(0);
-        const size_t lid = get_shifted_local_id(0);
-
-        values[lid] = gid;
-        sub_group_barrier(mem_fence::local);
-
-        for (int i = iter_lo; i < iter_hi; i++)
-        {
-            const size_t other_lid = get_shifted_local_id(i);
-            const size_t other_gid = get_shifted_global_id(i);
-
-            values[other_lid] += other_gid;
-            sub_group_barrier(mem_fence::local);
-
-            values[lid] += gid;
-            sub_group_barrier(mem_fence::local);
-        }
-
-        output[gid] = values[lid];
-    }
-    )";
-    }
-
-    return s.str();
-}
-#endif
-
-int test(cl_device_id device, cl_context context, cl_command_queue queue, test_options options)
-{
-    int error = CL_SUCCESS;
-
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    if (!is_extension_available(device, "cl_khr_subgroups"))
-    {
-        log_info("SKIPPED: Extension `cl_khr_subgroups` is not supported. Skipping tests.\n");
-        return CL_SUCCESS;
-    }
-#endif
-
-    cl_program program;
-    cl_kernel kernel;
-
-    std::string kernel_name = "test";
-    std::string source = generate_source(options);
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name
-    );
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name, "-cl-std=CL2.0", false
-    );
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name
-    );
-    RETURN_ON_ERROR(error)
-#endif
-
-    size_t max_work_group_size;
-    error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_work_group_size), &max_work_group_size, NULL);
-    RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
-
-    if (options.barrier == barrier_type::local)
-    {
-        cl_ulong kernel_local_mem_size;
-        error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(kernel_local_mem_size), &kernel_local_mem_size, NULL);
-        RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
-
-        cl_ulong device_local_mem_size;
-        error = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(device_local_mem_size), &device_local_mem_size, NULL);
-        RETURN_ON_CL_ERROR(error, "clGetDeviceInfo")
-
-        max_work_group_size = (std::min<cl_ulong>)(max_work_group_size, (device_local_mem_size - kernel_local_mem_size) / sizeof(cl_long));
-    }
-
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution<size_t> global_size_dis(1, options.max_count);
-    std::uniform_int_distribution<size_t> local_size_dis(1, max_work_group_size);
-    std::uniform_int_distribution<int> iter_dis(0, 20);
-
-    for (size_t test = 0; test < options.num_tests; test++)
-    {
-        const size_t global_size = global_size_dis(gen);
-        const size_t local_size = local_size_dis(gen);
-        const size_t count = global_size;
-
-        const int iter_lo = -iter_dis(gen);
-        const int iter_hi = +iter_dis(gen);
-
-        cl_mem output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_long) * count, NULL, &error);
-        RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-        error = clSetKernelArg(kernel, 0, sizeof(iter_lo), &iter_lo);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        error = clSetKernelArg(kernel, 1, sizeof(iter_hi), &iter_hi);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        error = clSetKernelArg(kernel, 2, sizeof(output_buffer), &output_buffer);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        if (options.barrier == barrier_type::local)
-        {
-            error = clSetKernelArg(kernel, 3, sizeof(cl_long) * local_size, NULL);
-            RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        }
-
-        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
-        RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-        std::vector<cl_long> output(count);
-        error = clEnqueueReadBuffer(
-            queue, output_buffer, CL_TRUE,
-            0, sizeof(cl_long) * count,
-            static_cast<void *>(output.data()),
-            0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-        error = clReleaseMemObject(output_buffer);
-        RETURN_ON_CL_ERROR(error, "clReleaseMemObject")
-
-        for (size_t gid = 0; gid < count; gid++)
-        {
-            const long value = output[gid];
-            const long expected = gid + 2 * gid * (iter_hi - iter_lo);
-
-            if (value != expected)
-            {
-                RETURN_ON_ERROR_MSG(-1,
-                    "Element %lu has incorrect value. Expected: %ld, got: %ld",
-                    gid, expected, value
-                );
-            }
-        }
-    }
-
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-AUTO_TEST_CASE(test_sub_group_barrier_global)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    test_options options;
-    options.barrier = barrier_type::global;
-    options.num_tests = 1000;
-    options.max_count = num_elements;
-    return test(device, context, queue, options);
-}
-
-AUTO_TEST_CASE(test_sub_group_barrier_local)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    test_options options;
-    options.barrier = barrier_type::local;
-    options.num_tests = 1000;
-    options.max_count = num_elements;
-    return test(device, context, queue, options);
-}
-
-} // namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_TEST_SUB_GROUP_BARRIER_HPP
diff --git a/test_conformance/clcpp/synchronization/test_work_group_barrier.hpp b/test_conformance/clcpp/synchronization/test_work_group_barrier.hpp
deleted file mode 100644
index aa7fbd2081..0000000000
--- a/test_conformance/clcpp/synchronization/test_work_group_barrier.hpp
+++ /dev/null
@@ -1,330 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_TEST_WORK_GROUP_BARRIER_HPP
-#define TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_TEST_WORK_GROUP_BARRIER_HPP
-
-#include <sstream>
-#include <string>
-#include <vector>
-#include <algorithm>
-#include <random>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-
-
-namespace test_work_group_barrier {
-
-enum class barrier_type
-{
-    local,
-    global
-};
-
-struct test_options
-{
-    barrier_type barrier;
-    size_t max_count;
-    size_t num_tests;
-};
-
-const std::string source_common = R"(
-    // Circular shift of local ids
-    size_t get_shifted_local_id(int local_id_delta)
-    {
-        const int local_size = (int)get_local_size(0);
-        return (((int)get_local_id(0) + local_id_delta) % local_size + local_size) % local_size;
-    }
-
-    // Get global ids from shifted local ids
-    size_t get_shifted_global_id(int local_id_delta)
-    {
-        return get_group_id(0) * get_enqueued_local_size(0) + get_shifted_local_id(local_id_delta);
-    }
-)";
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-std::string generate_source(test_options options)
-{
-    std::stringstream s;
-    s << source_common;
-    if (options.barrier == barrier_type::global)
-    {
-        s << R"(
-    kernel void test(const int iter_lo, const int iter_hi, global long *output)
-    {
-        const size_t gid = get_shifted_global_id(0);
-
-        output[gid] = gid;
-        work_group_barrier(CLK_GLOBAL_MEM_FENCE);
-
-        for (int i = iter_lo; i < iter_hi; i++)
-        {
-            const size_t other_gid = get_shifted_global_id(i);
-
-            output[other_gid] += other_gid;
-            work_group_barrier(CLK_GLOBAL_MEM_FENCE);
-
-            output[gid] += gid;
-            work_group_barrier(CLK_GLOBAL_MEM_FENCE);
-        }
-    }
-    )";
-    }
-    else if (options.barrier == barrier_type::local)
-    {
-        s << R"(
-    kernel void test(const int iter_lo, const int iter_hi, global long *output, local long *values)
-    {
-        const size_t gid = get_shifted_global_id(0);
-        const size_t lid = get_shifted_local_id(0);
-
-        values[lid] = gid;
-        work_group_barrier(CLK_LOCAL_MEM_FENCE);
-
-        for (int i = iter_lo; i < iter_hi; i++)
-        {
-            const size_t other_lid = get_shifted_local_id(i);
-            const size_t other_gid = get_shifted_global_id(i);
-
-            values[other_lid] += other_gid;
-            work_group_barrier(CLK_LOCAL_MEM_FENCE);
-
-            values[lid] += gid;
-            work_group_barrier(CLK_LOCAL_MEM_FENCE);
-        }
-
-        output[gid] = values[lid];
-    }
-    )";
-    }
-
-    return s.str();
-}
-#else
-std::string generate_source(test_options options)
-{
-    std::stringstream s;
-    s << R"(
-    #include <opencl_memory>
-    #include <opencl_work_item>
-    #include <opencl_synchronization>
-
-    using namespace cl;
-
-    )";
-    s << source_common;
-
-    if (options.barrier == barrier_type::global)
-    {
-        s << R"(
-    kernel void test(const int iter_lo, const int iter_hi, global_ptr<long[]> output)
-    {
-        const size_t gid = get_shifted_global_id(0);
-
-        output[gid] = gid;
-        work_group_barrier(mem_fence::global);
-
-        for (int i = iter_lo; i < iter_hi; i++)
-        {
-            const size_t other_gid = get_shifted_global_id(i);
-
-            output[other_gid] += other_gid;
-            work_group_barrier(mem_fence::global);
-
-            output[gid] += gid;
-            work_group_barrier(mem_fence::global);
-        }
-    }
-    )";
-    }
-    else if (options.barrier == barrier_type::local)
-    {
-        s << R"(
-    kernel void test(const int iter_lo, const int iter_hi, global_ptr<long[]> output, local_ptr<long[]> values)
-    {
-        const size_t gid = get_shifted_global_id(0);
-        const size_t lid = get_shifted_local_id(0);
-
-        values[lid] = gid;
-        work_group_barrier(mem_fence::local);
-
-        for (int i = iter_lo; i < iter_hi; i++)
-        {
-            const size_t other_lid = get_shifted_local_id(i);
-            const size_t other_gid = get_shifted_global_id(i);
-
-            values[other_lid] += other_gid;
-            work_group_barrier(mem_fence::local);
-
-            values[lid] += gid;
-            work_group_barrier(mem_fence::local);
-        }
-
-        output[gid] = values[lid];
-    }
-    )";
-    }
-
-    return s.str();
-}
-#endif
-
-int test(cl_device_id device, cl_context context, cl_command_queue queue, test_options options)
-{
-    int error = CL_SUCCESS;
-
-    cl_program program;
-    cl_kernel kernel;
-
-    std::string kernel_name = "test";
-    std::string source = generate_source(options);
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name
-    );
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name, "-cl-std=CL2.0", false
-    );
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name
-    );
-    RETURN_ON_ERROR(error)
-#endif
-
-    size_t max_work_group_size;
-    error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_work_group_size), &max_work_group_size, NULL);
-    RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
-
-    if (options.barrier == barrier_type::local)
-    {
-        cl_ulong kernel_local_mem_size;
-        error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(kernel_local_mem_size), &kernel_local_mem_size, NULL);
-        RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
-
-        cl_ulong device_local_mem_size;
-        error = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(device_local_mem_size), &device_local_mem_size, NULL);
-        RETURN_ON_CL_ERROR(error, "clGetDeviceInfo")
-
-        max_work_group_size = (std::min<cl_ulong>)(max_work_group_size, (device_local_mem_size - kernel_local_mem_size) / sizeof(cl_long));
-    }
-
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution<size_t> global_size_dis(1, options.max_count);
-    std::uniform_int_distribution<size_t> local_size_dis(1, max_work_group_size);
-    std::uniform_int_distribution<int> iter_dis(0, 20);
-
-    for (size_t test = 0; test < options.num_tests; test++)
-    {
-        const size_t global_size = global_size_dis(gen);
-        const size_t local_size = local_size_dis(gen);
-        const size_t count = global_size;
-
-        const int iter_lo = -iter_dis(gen);
-        const int iter_hi = +iter_dis(gen);
-
-        cl_mem output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_long) * count, NULL, &error);
-        RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-        error = clSetKernelArg(kernel, 0, sizeof(iter_lo), &iter_lo);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        error = clSetKernelArg(kernel, 1, sizeof(iter_hi), &iter_hi);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        error = clSetKernelArg(kernel, 2, sizeof(output_buffer), &output_buffer);
-        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        if (options.barrier == barrier_type::local)
-        {
-            error = clSetKernelArg(kernel, 3, sizeof(cl_long) * local_size, NULL);
-            RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-        }
-
-        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
-        RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-        std::vector<cl_long> output(count);
-        error = clEnqueueReadBuffer(
-            queue, output_buffer, CL_TRUE,
-            0, sizeof(cl_long) * count,
-            static_cast<void *>(output.data()),
-            0, NULL, NULL
-        );
-        RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-        error = clReleaseMemObject(output_buffer);
-        RETURN_ON_CL_ERROR(error, "clReleaseMemObject")
-
-        for (size_t gid = 0; gid < count; gid++)
-        {
-            const long value = output[gid];
-            const long expected = gid + 2 * gid * (iter_hi - iter_lo);
-
-            if (value != expected)
-            {
-                RETURN_ON_ERROR_MSG(-1,
-                    "Element %lu has incorrect value. Expected: %ld, got: %ld",
-                    gid, expected, value
-                );
-            }
-        }
-    }
-
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-AUTO_TEST_CASE(test_work_group_barrier_global)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    test_options options;
-    options.barrier = barrier_type::global;
-    options.num_tests = 1000;
-    options.max_count = num_elements;
-    return test(device, context, queue, options);
-}
-
-AUTO_TEST_CASE(test_work_group_barrier_local)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    test_options options;
-    options.barrier = barrier_type::local;
-    options.num_tests = 1000;
-    options.max_count = num_elements;
-    return test(device, context, queue, options);
-}
-
-} // namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_TEST_WORK_GROUP_BARRIER_HPP
diff --git a/test_conformance/clcpp/utils_common/errors.hpp b/test_conformance/clcpp/utils_common/errors.hpp
deleted file mode 100644
index c1694626e9..0000000000
--- a/test_conformance/clcpp/utils_common/errors.hpp
+++ /dev/null
@@ -1,134 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_COMMON_ERRORS_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_COMMON_ERRORS_HPP
-
-#include <string>
-
-#include "../harness/errorHelpers.h"
-
-// ------------- Check OpenCL error helpers (marcos) -----------------
-
-std::string get_cl_error_string(cl_int error)
-{
-#define CASE_CL_ERROR(x) case x: return #x;
-    switch (error)
-    {
-        CASE_CL_ERROR(CL_SUCCESS)
-        CASE_CL_ERROR(CL_DEVICE_NOT_FOUND)
-        CASE_CL_ERROR(CL_DEVICE_NOT_AVAILABLE)
-        CASE_CL_ERROR(CL_COMPILER_NOT_AVAILABLE)
-        CASE_CL_ERROR(CL_MEM_OBJECT_ALLOCATION_FAILURE)
-        CASE_CL_ERROR(CL_OUT_OF_RESOURCES)
-        CASE_CL_ERROR(CL_OUT_OF_HOST_MEMORY)
-        CASE_CL_ERROR(CL_PROFILING_INFO_NOT_AVAILABLE)
-        CASE_CL_ERROR(CL_MEM_COPY_OVERLAP)
-        CASE_CL_ERROR(CL_IMAGE_FORMAT_MISMATCH)
-        CASE_CL_ERROR(CL_IMAGE_FORMAT_NOT_SUPPORTED)
-        CASE_CL_ERROR(CL_BUILD_PROGRAM_FAILURE)
-        CASE_CL_ERROR(CL_MAP_FAILURE)
-        CASE_CL_ERROR(CL_MISALIGNED_SUB_BUFFER_OFFSET)
-        CASE_CL_ERROR(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
-        CASE_CL_ERROR(CL_COMPILE_PROGRAM_FAILURE)
-        CASE_CL_ERROR(CL_LINKER_NOT_AVAILABLE)
-        CASE_CL_ERROR(CL_LINK_PROGRAM_FAILURE)
-        CASE_CL_ERROR(CL_DEVICE_PARTITION_FAILED)
-        CASE_CL_ERROR(CL_KERNEL_ARG_INFO_NOT_AVAILABLE)
-
-        CASE_CL_ERROR(CL_INVALID_VALUE)
-        CASE_CL_ERROR(CL_INVALID_DEVICE_TYPE)
-        CASE_CL_ERROR(CL_INVALID_PLATFORM)
-        CASE_CL_ERROR(CL_INVALID_DEVICE)
-        CASE_CL_ERROR(CL_INVALID_CONTEXT)
-        CASE_CL_ERROR(CL_INVALID_QUEUE_PROPERTIES)
-        CASE_CL_ERROR(CL_INVALID_COMMAND_QUEUE)
-        CASE_CL_ERROR(CL_INVALID_HOST_PTR)
-        CASE_CL_ERROR(CL_INVALID_MEM_OBJECT)
-        CASE_CL_ERROR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
-        CASE_CL_ERROR(CL_INVALID_IMAGE_SIZE)
-        CASE_CL_ERROR(CL_INVALID_SAMPLER)
-        CASE_CL_ERROR(CL_INVALID_BINARY)
-        CASE_CL_ERROR(CL_INVALID_BUILD_OPTIONS)
-        CASE_CL_ERROR(CL_INVALID_PROGRAM)
-        CASE_CL_ERROR(CL_INVALID_PROGRAM_EXECUTABLE)
-        CASE_CL_ERROR(CL_INVALID_KERNEL_NAME)
-        CASE_CL_ERROR(CL_INVALID_KERNEL_DEFINITION)
-        CASE_CL_ERROR(CL_INVALID_KERNEL)
-        CASE_CL_ERROR(CL_INVALID_ARG_INDEX)
-        CASE_CL_ERROR(CL_INVALID_ARG_VALUE)
-        CASE_CL_ERROR(CL_INVALID_ARG_SIZE)
-        CASE_CL_ERROR(CL_INVALID_KERNEL_ARGS)
-        CASE_CL_ERROR(CL_INVALID_WORK_DIMENSION)
-        CASE_CL_ERROR(CL_INVALID_WORK_GROUP_SIZE)
-        CASE_CL_ERROR(CL_INVALID_WORK_ITEM_SIZE)
-        CASE_CL_ERROR(CL_INVALID_GLOBAL_OFFSET)
-        CASE_CL_ERROR(CL_INVALID_EVENT_WAIT_LIST)
-        CASE_CL_ERROR(CL_INVALID_EVENT)
-        CASE_CL_ERROR(CL_INVALID_OPERATION)
-        CASE_CL_ERROR(CL_INVALID_GL_OBJECT)
-        CASE_CL_ERROR(CL_INVALID_BUFFER_SIZE)
-        CASE_CL_ERROR(CL_INVALID_MIP_LEVEL)
-        CASE_CL_ERROR(CL_INVALID_GLOBAL_WORK_SIZE)
-        CASE_CL_ERROR(CL_INVALID_PROPERTY)
-        CASE_CL_ERROR(CL_INVALID_IMAGE_DESCRIPTOR)
-        CASE_CL_ERROR(CL_INVALID_COMPILER_OPTIONS)
-        CASE_CL_ERROR(CL_INVALID_LINKER_OPTIONS)
-        CASE_CL_ERROR(CL_INVALID_DEVICE_PARTITION_COUNT)
-        CASE_CL_ERROR(CL_INVALID_PIPE_SIZE)
-        CASE_CL_ERROR(CL_INVALID_DEVICE_QUEUE)
-        CASE_CL_ERROR(CL_INVALID_SPEC_ID)
-        CASE_CL_ERROR(CL_MAX_SIZE_RESTRICTION_EXCEEDED)
-        default: return "(unknown error code)";
-    }
-#undef CASE_CL_ERROR
-}
-
-#define CHECK_ERROR(x) \
-    if(x != CL_SUCCESS) \
-    { \
-        log_error("ERROR: %d, file: %s, line: %d\n", x, __FILE__, __LINE__);\
-    }
-#define CHECK_ERROR_MSG(x, ...) \
-    if(x != CL_SUCCESS) \
-    { \
-        log_error("ERROR: " __VA_ARGS__);\
-        log_error("\n");\
-        log_error("ERROR: %d, file: %s, line: %d\n", x, __FILE__, __LINE__);\
-    }
-#define RETURN_ON_ERROR(x) \
-    if(x != CL_SUCCESS) \
-    { \
-        log_error("ERROR: %d, file: %s, line: %d\n", x, __FILE__, __LINE__);\
-        return x;\
-    }
-#define RETURN_ON_ERROR_MSG(x, ...) \
-    if(x != CL_SUCCESS) \
-    { \
-        log_error("ERROR: " __VA_ARGS__);\
-        log_error("\n");\
-        log_error("ERROR: %d, file: %s, line: %d\n", x, __FILE__, __LINE__);\
-        return x;\
-    }
-
-#define RETURN_ON_CL_ERROR(x, cl_func_name) \
-    if(x != CL_SUCCESS) \
-    { \
-        log_error("ERROR: %s failed: %s (%d)\n", cl_func_name, get_cl_error_string(x).c_str(), x);\
-        log_error("ERROR: %d, file: %s, line: %d\n", x, __FILE__, __LINE__);\
-        return x;\
-    }
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_TEST_ERRORS_HPP
diff --git a/test_conformance/clcpp/utils_common/is_vector_type.hpp b/test_conformance/clcpp/utils_common/is_vector_type.hpp
deleted file mode 100644
index 0232e51374..0000000000
--- a/test_conformance/clcpp/utils_common/is_vector_type.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_COMMON_IS_VECTOR_TYPE_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_COMMON_IS_VECTOR_TYPE_HPP
-
-#include "../common.hpp"
-
-// is_vector_type<Type>::value is true if Type is an OpenCL
-// vector type; otherwise - false.
-//
-// Examples: 
-// * is_vector_type<cl_float>::value == false
-// * is_vector_type<cl_float4>::value == true
-template<class Type>
-struct is_vector_type
-{
-    const static bool value = false;
-};
-
-#define ADD_VECTOR_TYPE(Type, n) \
-    template<> \
-    struct is_vector_type<Type ## n> \
-    { \
-        const static bool value = true; \
-    };
-
-#define ADD_VECTOR_TYPES(Type) \
-    ADD_VECTOR_TYPE(Type, 2) \
-    ADD_VECTOR_TYPE(Type, 4) \
-    ADD_VECTOR_TYPE(Type, 8) \
-    ADD_VECTOR_TYPE(Type, 16)
-
-ADD_VECTOR_TYPES(cl_char)
-ADD_VECTOR_TYPES(cl_uchar)
-ADD_VECTOR_TYPES(cl_short)
-ADD_VECTOR_TYPES(cl_ushort)
-ADD_VECTOR_TYPES(cl_int)
-ADD_VECTOR_TYPES(cl_uint)
-ADD_VECTOR_TYPES(cl_long)
-ADD_VECTOR_TYPES(cl_ulong)
-ADD_VECTOR_TYPES(cl_float)
-ADD_VECTOR_TYPES(cl_double)
-
-#undef ADD_VECTOR_TYPES
-#undef ADD_VECTOR_TYPE
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_COMMON_IS_VECTOR_TYPE_HPP
diff --git a/test_conformance/clcpp/utils_common/kernel_helpers.hpp b/test_conformance/clcpp/utils_common/kernel_helpers.hpp
deleted file mode 100644
index 189b8238fb..0000000000
--- a/test_conformance/clcpp/utils_common/kernel_helpers.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_COMMON_KERNEL_HELPERS_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_COMMON_KERNEL_HELPERS_HPP
-
-#include "../common.hpp"
-
-// Creates a OpenCL C++/C program out_program and kernel out_kernel.
-int create_opencl_kernel(cl_context context,
-                         cl_program *out_program,
-                         cl_kernel *out_kernel,
-                         const char *source,
-                         const std::string& kernel_name,
-                         const std::string& build_options = "",
-                         const bool openclCXX = true)
-{
-    return create_single_kernel_helper(
-        context, out_program, out_kernel, 1, &source,
-        kernel_name.c_str(), build_options.c_str(), openclCXX
-    );
-}
-
-int create_opencl_kernel(cl_context context,
-                         cl_program *out_program,
-                         cl_kernel *out_kernel,
-                         const std::string& source,
-                         const std::string& kernel_name,
-                         const std::string& build_options = "",
-                         const bool openclCXX = true)
-{
-    return create_opencl_kernel(
-        context, out_program, out_kernel,
-        source.c_str(), kernel_name, build_options, openclCXX
-    );
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_COMMON_KERNEL_HELPERS_HPP
diff --git a/test_conformance/clcpp/utils_common/make_vector_type.hpp b/test_conformance/clcpp/utils_common/make_vector_type.hpp
deleted file mode 100644
index 11b11856b7..0000000000
--- a/test_conformance/clcpp/utils_common/make_vector_type.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_COMMON_MAKE_VECTOR_TYPE_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_COMMON_MAKE_VECTOR_TYPE_HPP
-
-#include "../common.hpp"
-
-// Using scalar_type and i creates a type scalar_typei.
-// 
-// Example:
-// * make_vector_type<cl_uint, 8>::type is cl_uint8
-// * make_vector_type<cl_uint, 1>::type is cl_uint
-template<class scalar_type, size_t i>
-struct make_vector_type
-{
-    typedef void type;
-};
-
-#define ADD_MAKE_VECTOR_TYPE(Type, n) \
-    template<> \
-    struct make_vector_type<Type, n> \
-    { \
-        typedef Type ## n type; \
-    };
-
-#define ADD_MAKE_VECTOR_TYPES(Type) \
-    template<> \
-    struct make_vector_type<Type, 1> \
-    { \
-        typedef Type type; \
-    }; \
-    ADD_MAKE_VECTOR_TYPE(Type, 2) \
-    ADD_MAKE_VECTOR_TYPE(Type, 3) \
-    ADD_MAKE_VECTOR_TYPE(Type, 4) \
-    ADD_MAKE_VECTOR_TYPE(Type, 8) \
-    ADD_MAKE_VECTOR_TYPE(Type, 16)
-
-ADD_MAKE_VECTOR_TYPES(cl_char)
-ADD_MAKE_VECTOR_TYPES(cl_uchar)
-ADD_MAKE_VECTOR_TYPES(cl_short)
-ADD_MAKE_VECTOR_TYPES(cl_ushort)
-ADD_MAKE_VECTOR_TYPES(cl_int)
-ADD_MAKE_VECTOR_TYPES(cl_uint)
-ADD_MAKE_VECTOR_TYPES(cl_long)
-ADD_MAKE_VECTOR_TYPES(cl_ulong)
-ADD_MAKE_VECTOR_TYPES(cl_float)
-ADD_MAKE_VECTOR_TYPES(cl_double)
-
-#undef ADD_MAKE_VECTOR_TYPES
-#undef ADD_MAKE_VECTOR_TYPE
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_COMMON_MAKE_VECTOR_TYPE_HPP
diff --git a/test_conformance/clcpp/utils_common/scalar_type.hpp b/test_conformance/clcpp/utils_common/scalar_type.hpp
deleted file mode 100644
index 4c939bb2b3..0000000000
--- a/test_conformance/clcpp/utils_common/scalar_type.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_COMMON_SCALAR_TYPE_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_COMMON_SCALAR_TYPE_HPP
-
-#include "../common.hpp"
-
-// scalar_type<Type>::type returns scalar type of Type.
-//
-// Examples:
-// * scalar_type<cl_float>::type is cl_float
-// * scalar_type<cl_float4>::types is cl_float
-template<class Type>
-struct scalar_type
-{
-    typedef void type;
-};
-
-#define ADD_VECTOR_TYPE(Type, n) \
-    template<> \
-    struct scalar_type<Type ## n> \
-    { \
-        typedef Type type; \
-    };
-
-#define ADD_VECTOR_TYPES(Type) \
-    template<> \
-    struct scalar_type<Type> \
-    { \
-        typedef Type type; \
-    }; \
-    ADD_VECTOR_TYPE(Type, 2) \
-    ADD_VECTOR_TYPE(Type, 4) \
-    ADD_VECTOR_TYPE(Type, 8) \
-    ADD_VECTOR_TYPE(Type, 16)
-
-ADD_VECTOR_TYPES(cl_char)
-ADD_VECTOR_TYPES(cl_uchar)
-ADD_VECTOR_TYPES(cl_short)
-ADD_VECTOR_TYPES(cl_ushort)
-ADD_VECTOR_TYPES(cl_int)
-ADD_VECTOR_TYPES(cl_uint)
-ADD_VECTOR_TYPES(cl_long)
-ADD_VECTOR_TYPES(cl_ulong)
-ADD_VECTOR_TYPES(cl_float)
-ADD_VECTOR_TYPES(cl_double)
-
-#undef ADD_VECTOR_TYPES
-#undef ADD_VECTOR_TYPE
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_COMMON_SCALAR_TYPE_HPP
diff --git a/test_conformance/clcpp/utils_common/string.hpp b/test_conformance/clcpp/utils_common/string.hpp
deleted file mode 100644
index ad5ac9f086..0000000000
--- a/test_conformance/clcpp/utils_common/string.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_COMMON_STRING_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_COMMON_STRING_HPP
-
-
-#include <string>
-#include <sstream>
-#include <iomanip>
-#include <type_traits>
-
-#include "is_vector_type.hpp"
-#include "scalar_type.hpp"
-#include "type_name.hpp"
-
-#include "../common.hpp"
-
-
-template<class type>
-std::string format_value(const type& value,
-                         typename std::enable_if<is_vector_type<type>::value>::type* = 0)
-{
-    std::stringstream s;
-    s << type_name<type>() << "{ ";
-    s << std::scientific << std::setprecision(6);
-    for (size_t j = 0; j < vector_size<type>::value; j++)
-    {
-        if (j > 0)
-            s << ", ";
-        s << value.s[j];
-    }
-    s << " }";
-    return s.str();
-}
-
-template<class type>
-std::string format_value(const type& value,
-                         typename std::enable_if<!is_vector_type<type>::value>::type* = 0)
-{
-    std::stringstream s;
-    s << type_name<type>() << "{ ";
-    s << std::scientific << std::setprecision(6);
-    s << value;
-    s << " }";
-    return s.str();
-}
-
-void replace_all(std::string& str, const std::string& from, const std::string& to)
-{
-    size_t start_pos = 0;
-    while((start_pos = str.find(from, start_pos)) != std::string::npos) {
-        str.replace(start_pos, from.length(), to);
-        start_pos += to.length();
-    }
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_COMMON_STRING_HPP
diff --git a/test_conformance/clcpp/utils_common/type_name.hpp b/test_conformance/clcpp/utils_common/type_name.hpp
deleted file mode 100644
index c66f6e49e5..0000000000
--- a/test_conformance/clcpp/utils_common/type_name.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_COMMON_TYPE_NAME_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_COMMON_TYPE_NAME_HPP
-
-#include "../common.hpp"
-
-// Returns type name (in OpenCL device). 
-// cl_uint - "uint", cl_float2 -> "float2"
-template<class Type>
-std::string type_name()
-{
-    return "unknown";
-}
-
-#define ADD_TYPE_NAME(Type, str) \
-    template<> \
-    std::string type_name<Type>() \
-    { \
-        return #str; \
-    }
-
-#define ADD_TYPE_NAME2(Type) \
-    ADD_TYPE_NAME(cl_ ## Type, Type)
-
-#define ADD_TYPE_NAME3(Type, x) \
-    ADD_TYPE_NAME2(Type ## x)
-
-#define ADD_TYPE_NAMES(Type) \
-    ADD_TYPE_NAME2(Type) \
-    ADD_TYPE_NAME3(Type, 2) \
-    ADD_TYPE_NAME3(Type, 4) \
-    ADD_TYPE_NAME3(Type, 8) \
-    ADD_TYPE_NAME3(Type, 16)
-
-ADD_TYPE_NAMES(char)
-ADD_TYPE_NAMES(uchar)
-ADD_TYPE_NAMES(short)
-ADD_TYPE_NAMES(ushort)
-ADD_TYPE_NAMES(int)
-ADD_TYPE_NAMES(uint)
-ADD_TYPE_NAMES(long)
-ADD_TYPE_NAMES(ulong)
-ADD_TYPE_NAMES(float)
-ADD_TYPE_NAMES(double)
-
-#undef ADD_TYPE_NAMES
-#undef ADD_TYPE_NAME3
-#undef ADD_TYPE_NAME2
-#undef ADD_TYPE_NAME
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_COMMON_TYPE_NAME_HPP
diff --git a/test_conformance/clcpp/utils_common/type_supported.hpp b/test_conformance/clcpp/utils_common/type_supported.hpp
deleted file mode 100644
index 8d4f721b46..0000000000
--- a/test_conformance/clcpp/utils_common/type_supported.hpp
+++ /dev/null
@@ -1,106 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_COMMON_TYPE_SUPPORTED_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_COMMON_TYPE_SUPPORTED_HPP
-
-#include "../common.hpp"
-
-// Returns true if type is supported by device; otherwise - false;
-template<class Type>
-bool type_supported(cl_device_id device)
-{
-    (void) device;
-    return false;
-}
-
-#define ADD_SUPPORTED_TYPE(Type) \
-    template<> \
-    bool type_supported<Type>(cl_device_id device) \
-    { \
-        (void) device; \
-        return true; \
-    }
-
-ADD_SUPPORTED_TYPE(cl_char)
-ADD_SUPPORTED_TYPE(cl_uchar)
-ADD_SUPPORTED_TYPE(cl_short)
-ADD_SUPPORTED_TYPE(cl_ushort)
-ADD_SUPPORTED_TYPE(cl_int)
-ADD_SUPPORTED_TYPE(cl_uint)
-
-// ulong
-template<>
-bool type_supported<cl_ulong>(cl_device_id device)
-{
-    // long types do not have to be supported in EMBEDDED_PROFILE.
-    char profile[128];
-    int error;
-
-    error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile), (void *)&profile, NULL);
-    if (error != CL_SUCCESS)
-    {
-        log_error("ERROR: clGetDeviceInfo failed with CL_DEVICE_PROFILE\n");
-        return false;
-    }
-
-    if (std::strcmp(profile, "EMBEDDED_PROFILE") == 0)
-        return is_extension_available(device, "cles_khr_int64");
-
-    return true;
-}
-// long
-template<>
-bool type_supported<cl_long>(cl_device_id device)
-{
-    return type_supported<cl_ulong>(device);
-}
-ADD_SUPPORTED_TYPE(cl_float)
-// double
-template<>
-bool type_supported<cl_double>(cl_device_id device)
-{
-    return is_extension_available(device, "cl_khr_fp64");
-}
-
-#define ADD_SUPPORTED_VEC_TYPE1(Type, n) \
-    template<> \
-    bool type_supported<Type ## n>(cl_device_id device) \
-    { \
-        return type_supported<Type>(device); \
-    }
-
-#define ADD_SUPPORTED_VEC_TYPE2(Type) \
-    ADD_SUPPORTED_VEC_TYPE1(Type, 2) \
-    ADD_SUPPORTED_VEC_TYPE1(Type, 4) \
-    ADD_SUPPORTED_VEC_TYPE1(Type, 8) \
-    ADD_SUPPORTED_VEC_TYPE1(Type, 16)
-
-ADD_SUPPORTED_VEC_TYPE2(cl_char)
-ADD_SUPPORTED_VEC_TYPE2(cl_uchar)
-ADD_SUPPORTED_VEC_TYPE2(cl_short)
-ADD_SUPPORTED_VEC_TYPE2(cl_ushort)
-ADD_SUPPORTED_VEC_TYPE2(cl_int)
-ADD_SUPPORTED_VEC_TYPE2(cl_uint)
-ADD_SUPPORTED_VEC_TYPE2(cl_long)
-ADD_SUPPORTED_VEC_TYPE2(cl_ulong)
-ADD_SUPPORTED_VEC_TYPE2(cl_float)
-// ADD_SUPPORTED_VEC_TYPE2(cl_double)
-
-#undef ADD_SUPPORTED_VEC_TYPE2
-#undef ADD_SUPPORTED_VEC_TYPE1
-#undef ADD_SUPPORTED_TYPE
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_COMMON_TYPE_SUPPORTED_HPP
diff --git a/test_conformance/clcpp/utils_common/vector_size.hpp b/test_conformance/clcpp/utils_common/vector_size.hpp
deleted file mode 100644
index 4817506e47..0000000000
--- a/test_conformance/clcpp/utils_common/vector_size.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_COMMON_VECTOR_SIZE_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_COMMON_VECTOR_SIZE_HPP
-
-#include "../common.hpp"
-
-// Returns 1 if Type is a scalar type; otherwise if it's a vector type, 
-// it returns number of components in that Type. 
-template<class Type>
-struct vector_size
-{
-    const static size_t value = 1;
-};
-
-#define ADD_VECTOR_SIZE_TYPE(Type, n) \
-    template<> \
-    struct vector_size<Type ## n> \
-    { \
-        const static size_t value = n; \
-    };
-
-#define ADD_VECTOR_SIZE_TYPES(Type) \
-    template<> \
-    struct vector_size<Type> \
-    { \
-        const static size_t value = 1; \
-    }; \
-    ADD_VECTOR_SIZE_TYPE(Type, 2) \
-    ADD_VECTOR_SIZE_TYPE(Type, 4) \
-    ADD_VECTOR_SIZE_TYPE(Type, 8) \
-    ADD_VECTOR_SIZE_TYPE(Type, 16)
-
-ADD_VECTOR_SIZE_TYPES(cl_char)
-ADD_VECTOR_SIZE_TYPES(cl_uchar)
-ADD_VECTOR_SIZE_TYPES(cl_short)
-ADD_VECTOR_SIZE_TYPES(cl_ushort)
-ADD_VECTOR_SIZE_TYPES(cl_int)
-ADD_VECTOR_SIZE_TYPES(cl_uint)
-ADD_VECTOR_SIZE_TYPES(cl_long)
-ADD_VECTOR_SIZE_TYPES(cl_ulong)
-ADD_VECTOR_SIZE_TYPES(cl_float)
-ADD_VECTOR_SIZE_TYPES(cl_double)
-
-#undef ADD_VECTOR_SIZE_TYPES
-#undef ADD_VECTOR_SIZE_TYPE
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_COMMON_VECTOR_SIZE_HPP
diff --git a/test_conformance/clcpp/utils_test/binary.hpp b/test_conformance/clcpp/utils_test/binary.hpp
deleted file mode 100644
index 893cbed09b..0000000000
--- a/test_conformance/clcpp/utils_test/binary.hpp
+++ /dev/null
@@ -1,305 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_TEST_BINARY_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_TEST_BINARY_HPP
-
-#include <type_traits>
-#include <algorithm>
-#include <string>
-#include <cmath>
-
-#include "../common.hpp"
-
-#include "detail/base_func_type.hpp"
-#include "generate_inputs.hpp"
-#include "compare.hpp"
-
-template<class IN1, class IN2, class OUT1>
-struct binary_func : public detail::base_func_type<OUT1>
-{
-    typedef IN1 in1_type;
-    typedef IN2 in2_type;
-    typedef OUT1 out_type;
-
-    virtual ~binary_func() {};
-    virtual std::string str() = 0;
-
-    std::string decl_str()
-    {
-        return type_name<OUT1>() + "(" + type_name<IN1>() + ", " + type_name<IN2>() + ")";
-    }
-
-    bool is_in1_bool()
-    {
-        return false;
-    }
-
-    bool is_in2_bool()
-    {
-        return false;
-    }
-
-    IN1 min1()
-    {
-        return detail::get_min<IN1>();
-    }
-
-    IN1 max1()
-    {
-        return detail::get_max<IN1>();
-    }
-
-    IN2 min2()
-    {
-        return detail::get_min<IN2>();
-    }
-
-    IN2 max2()
-    {
-        return detail::get_max<IN2>();
-    }
-
-    std::vector<IN1> in1_special_cases()
-    {
-        return { };
-    }
-
-    std::vector<IN2> in2_special_cases()
-    {
-        return { };
-    }
-
-    template<class T>
-    typename make_vector_type<cl_double, vector_size<T>::value>::type
-    delta(const IN1& in1, const IN2& in2, const T& expected)
-    {
-        typedef
-            typename make_vector_type<cl_double, vector_size<T>::value>::type
-            delta_vector_type;
-        // Take care of unused variable warning
-        (void) in1;
-        (void) in2;
-        auto e = detail::make_value<delta_vector_type>(1e-3);
-        return detail::multiply<delta_vector_type>(e, expected);
-    }
-};
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <class func_type, class in1_type, class in2_type, class out_type>
-std::string generate_kernel_binary(func_type func)
-{
-    std::string in1_value = "input1[gid]";
-    if(func.is_in1_bool())
-    {
-        std::string i = vector_size<in1_type>::value == 1 ? "" : std::to_string(vector_size<in1_type>::value);
-        in1_value = "(input1[gid] != (int" + i + ")(0))";
-    }
-    std::string in2_value = "input2[gid]";
-    if(func.is_in2_bool())
-    {
-        std::string i = vector_size<in2_type>::value == 1 ? "" : std::to_string(vector_size<in2_type>::value);
-        in2_value = "(input2[gid] != (int" + i + ")(0))";
-    }
-    std::string function_call = func.str() + "(" + in1_value + ", " + in2_value + ")";
-    if(func.is_out_bool())
-    {
-        std::string i = vector_size<out_type>::value == 1 ? "" : std::to_string(vector_size<out_type>::value);
-        function_call = "convert_int" + i + "(" + func.str() + "(" + in1_value + ", " + in2_value + "))";
-    }
-    return
-        "__kernel void " + func.get_kernel_name() + "(global " + type_name<in1_type>() + " *input1,\n"
-        "                                      global " + type_name<in2_type>() + " *input2,\n"
-        "                                      global " + type_name<out_type>() + " *output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    output[gid] = " + function_call + ";\n"
-        "}\n";
-}
-#else
-template <class func_type, class in1_type, class in2_type, class out_type>
-std::string generate_kernel_binary(func_type func)
-{
-    std::string headers = func.headers();
-    std::string in1_value = "input1[gid]";
-    if(func.is_in1_bool())
-    {
-        std::string i = vector_size<in1_type>::value == 1 ? "" : std::to_string(vector_size<in1_type>::value);
-        in1_value = "(input1[gid] != (int" + i + ")(0))";
-    }
-    std::string in2_value = "input2[gid]";
-    if(func.is_in2_bool())
-    {
-        std::string i = vector_size<in2_type>::value == 1 ? "" : std::to_string(vector_size<in2_type>::value);
-        in2_value = "(input2[gid] != (int" + i + ")(0))";
-    }
-    std::string function_call = func.str() + "(" + in1_value + ", " + in2_value + ")";
-    if(func.is_out_bool())
-    {
-        std::string i = vector_size<out_type>::value == 1 ? "" : std::to_string(vector_size<out_type>::value);
-        function_call = "convert_cast<int" + i + ">(" + func.str() + "(" + in1_value + ", " + in2_value + "))";
-    }
-    if(func.is_out_bool() || func.is_in1_bool() || func.is_in2_bool())
-    {
-        if(headers.find("#include <opencl_convert>") == std::string::npos)
-        {
-            headers += "#include <opencl_convert>\n";
-        }
-    }
-    return
-        "" + func.defs() +
-        "" + headers +
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "using namespace cl;\n"
-        "__kernel void " + func.get_kernel_name() + "(global_ptr<" + type_name<in1_type>() + "[]> input1,\n"
-        "                                      global_ptr<" + type_name<in2_type>() + "[]> input2,\n"
-        "                                      global_ptr<" + type_name<out_type>() + "[]> output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    output[gid] = " + function_call + ";\n"
-        "}\n";
-}
-#endif
-
-template<class INPUT1, class INPUT2, class OUTPUT, class binary_op>
-bool verify_binary(const std::vector<INPUT1> &in1,
-                   const std::vector<INPUT2> &in2,
-                   const std::vector<OUTPUT> &out,
-                   binary_op op)
-{
-    for(size_t i = 0; i < in1.size(); i++)
-    {
-        auto expected = op(in1[i], in2[i]);
-        if(!are_equal(expected, out[i], op.delta(in1[i], in2[i], expected), op))
-        {
-            print_error_msg(expected, out[i], i, op);
-            return false;
-        }
-    }
-    return true;
-}
-
-template <class binary_op>
-int test_binary_func(cl_device_id device, cl_context context, cl_command_queue queue, size_t count, binary_op op)
-{
-    cl_mem buffers[3];
-    cl_program program;
-    cl_kernel kernel;
-    size_t work_size[1];
-    int err;
-
-    typedef typename binary_op::in1_type INPUT1;
-    typedef typename binary_op::in2_type INPUT2;
-    typedef typename binary_op::out_type OUTPUT;
-
-    // Don't run test for unsupported types
-    if(!(type_supported<INPUT1>(device)
-         && type_supported<INPUT2>(device)
-         && type_supported<OUTPUT>(device)))
-    {
-        return CL_SUCCESS;
-    }
-
-    std::string code_str = generate_kernel_binary<binary_op, INPUT1, INPUT2, OUTPUT>(op);
-    std::string kernel_name = op.get_kernel_name();
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name, "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(err)
-#endif
-
-    std::vector<INPUT1> in1_spec_cases = op.in1_special_cases();
-    std::vector<INPUT2> in2_spec_cases = op.in2_special_cases();
-    prepare_special_cases(in1_spec_cases, in2_spec_cases);
-    std::vector<INPUT1> input1 = generate_input<INPUT1>(count, op.min1(), op.max1(), in1_spec_cases);
-    std::vector<INPUT2> input2 = generate_input<INPUT2>(count, op.min2(), op.max2(), in2_spec_cases);
-    std::vector<OUTPUT> output = generate_output<OUTPUT>(count);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(INPUT1) * input1.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(INPUT2) * input2.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    buffers[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(OUTPUT) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(INPUT1) * input1.size(),
-        static_cast<void *>(input1.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer")
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(INPUT2) * input2.size(),
-        static_cast<void *>(input2.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer")
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    err |= clSetKernelArg(kernel, 2, sizeof(buffers[2]), &buffers[2]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    work_size[0] = count;
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[2], CL_TRUE, 0, sizeof(OUTPUT) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (!verify_binary(input1, input2, output, op))
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "test_%s %s(%s, %s) failed", op.str().c_str(),
-            type_name<OUTPUT>().c_str(), type_name<INPUT1>().c_str(), type_name<INPUT2>().c_str()
-        );
-    }
-    log_info(
-        "test_%s %s(%s, %s) passed\n", op.str().c_str(),
-        type_name<OUTPUT>().c_str(), type_name<INPUT1>().c_str(), type_name<INPUT2>().c_str()
-    );
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseMemObject(buffers[2]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_TEST_BINARY_HPP
diff --git a/test_conformance/clcpp/utils_test/compare.hpp b/test_conformance/clcpp/utils_test/compare.hpp
deleted file mode 100644
index a22b88fd21..0000000000
--- a/test_conformance/clcpp/utils_test/compare.hpp
+++ /dev/null
@@ -1,161 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_TEST_COMPARE_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_TEST_COMPARE_HPP
-
-#include <random>
-#include <limits>
-#include <type_traits>
-#include <algorithm>
-
-#include <cmath>
-
-#include "../common.hpp"
-
-// Checks if x is equal to y.
-template<class type, class delta_type, class op_type>
-inline bool are_equal(const type& x,
-                      const type& y,
-                      const delta_type& delta,
-                      op_type op,
-                      typename std::enable_if<
-                        is_vector_type<type>::value
-                        && std::is_integral<typename scalar_type<type>::type>::value
-                      >::type* = 0)
-{
-    (void) delta;
-    for(size_t i = 0; i < vector_size<type>::value; i++)
-    {
-        if(op.is_out_bool())
-        {
-            if(!((x.s[i] != 0) == (y.s[i] != 0)))
-            {
-                return false;
-            }
-        }
-        else if(!(x.s[i] == y.s[i]))
-        {
-            return false;
-        }
-    }
-    return true;
-}
-
-template<class type, class delta_type, class op_type>
-inline bool are_equal(const type& x,
-                      const type& y,
-                      const delta_type& delta,
-                      op_type op,
-                      typename std::enable_if<
-                        !is_vector_type<type>::value
-                        && std::is_integral<type>::value
-                      >::type* = 0)
-{
-    (void) delta;
-    if(op.is_out_bool())
-    {
-        if(!((x != 0) == (y != 0)))
-        {
-            return false;
-        }
-    }
-    return x == y;
-}
-
-template<class type, class type1, class type2, class op_type>
-inline bool are_equal(const type& x,
-                      const type1& y,
-                      const type2& delta,
-                      op_type op,
-                      typename std::enable_if<
-                        !is_vector_type<type>::value
-                        && std::is_floating_point<type>::value
-                      >::type* = 0)
-{
-    // x - expected
-    // y - result
-
-    // INFO:
-    // Whe don't care about subnormal values in OpenCL C++ tests
-    if(std::fpclassify(static_cast<type1>(x)) == FP_SUBNORMAL || std::fpclassify(y) == FP_SUBNORMAL)
-    {
-        return true;
-    }
-
-    // both are NaN
-    if((std::isnan)(static_cast<type1>(x)) && (std::isnan)(y))
-    {
-        return true;
-    }
-    // one is NaN
-    else if((std::isnan)(static_cast<type1>(x)) || (std::isnan)(y))
-    {
-        return false;
-    }
-
-    // Check for perfect match, it also covers inf, -inf
-    if(static_cast<type1>(x) != y)
-    {
-        // Check if values are close
-        if(std::abs(static_cast<type1>(x) - y) > (std::max)(std::numeric_limits<type2>::epsilon(), std::abs(delta)))
-        {
-            return false;
-        }
-        // Check ulp
-        if(op.use_ulp())
-        {
-            return !(std::abs(Ulp_Error(x, y)) > op.ulp());
-        }
-    }
-    return true;
-}
-
-template<class type, class type1, class type2, class op_type>
-inline bool are_equal(const type& x,
-                      const type1& y,
-                      const type2& delta,
-                      op_type op,
-                      typename std::enable_if<
-                        is_vector_type<type>::value
-                        && std::is_floating_point<typename scalar_type<type>::type>::value
-                      >::type* = 0)
-{
-    // x - expected
-    // y - result
-    for(size_t i = 0; i < vector_size<type>::value; i++)
-    {
-        if(!are_equal(x.s[i], y.s[i], delta.s[i], op))
-        {
-            return false;
-        }
-    }
-    return true;
-}
-
-template<class type, class type1, class func>
-inline void print_error_msg(const type& expected, const type1& result, size_t i, func op)
-{
-    log_error(
-        "ERROR: test_%s %s failed. Error at %lu: Expected: %s, got: %s\n",
-        op.str().c_str(),
-        op.decl_str().c_str(),
-        i,
-        format_value(expected).c_str(),
-        format_value(result).c_str()
-    );
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_TEST_COMPARE_HPP
diff --git a/test_conformance/clcpp/utils_test/detail/base_func_type.hpp b/test_conformance/clcpp/utils_test/detail/base_func_type.hpp
deleted file mode 100644
index 92e375d008..0000000000
--- a/test_conformance/clcpp/utils_test/detail/base_func_type.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_TEST_DETAIL_BASE_FUNC_TYPE_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_TEST_DETAIL_BASE_FUNC_TYPE_HPP
-
-#include <random>
-#include <limits>
-#include <type_traits>
-#include <algorithm>
-
-#include <cmath>
-
-#include "../../common.hpp"
-
-#include "vec_helpers.hpp"
-
-namespace detail
-{
-
-template<class OUT1>
-struct base_func_type
-{   
-    virtual ~base_func_type() {};
-
-    // Returns function name
-    virtual std::string str() = 0;
-
-    // Returns name of the test kernel for that function
-    virtual std::string get_kernel_name()
-    {
-        std::string kn = this->str();
-        replace_all(kn, "::", "_");
-        return "test_" + kn;
-    }
-
-    // Returns required defines and pragmas.
-    virtual std::string defs()
-    {
-        return "";
-    }
-
-    // Returns required OpenCL C++ headers.
-    virtual std::string headers()
-    {
-        return "";
-    }
-
-    // Return true if OUT1 type in OpenCL kernel should be treated
-    // as bool type; false otherwise.
-    bool is_out_bool()
-    {
-        return false;
-    }
-
-    // Max ULP error, that is error should be raised when
-    // if Ulp_Error(result, expected) > ulp()
-    float ulp()
-    {
-        return 0.0f;
-    }
-
-    // Should we check ULP error when verifing if the result is
-    // correct? 
-    //
-    // (This effects how are_equal() function works, 
-    // it may not have effect if verify() method in derived
-    // class does not use are_equal() function.)
-    //
-    // Only for FP numbers/vectors
-    bool use_ulp()
-    {
-        return true;
-    }
-
-    // Max error. Error should be raised if
-    // abs(result - expected) > delta(.., expected)
-    //
-    // Default value: 0.001 * expected
-    //
-    // (This effects how are_equal() function works, 
-    // it may not have effect if verify() method in derived
-    // class does not use are_equal() function.)
-    //
-    // Only for FP numbers/vectors
-    template<class T>
-    typename make_vector_type<cl_double, vector_size<T>::value>::type
-    delta(const T& expected)
-    {
-        typedef 
-            typename make_vector_type<cl_double, vector_size<T>::value>::type
-            delta_vector_type;
-        auto e = detail::make_value<delta_vector_type>(1e-3);
-        return detail::multiply<delta_vector_type>(e, expected);
-    }
-};
-
-} // detail namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_TEST_DETAIL_BASE_FUNC_TYPE_HPP
diff --git a/test_conformance/clcpp/utils_test/detail/vec_helpers.hpp b/test_conformance/clcpp/utils_test/detail/vec_helpers.hpp
deleted file mode 100644
index 05df42aacd..0000000000
--- a/test_conformance/clcpp/utils_test/detail/vec_helpers.hpp
+++ /dev/null
@@ -1,104 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_TEST_DETAIL_VEC_HELPERS_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_TEST_DETAIL_VEC_HELPERS_HPP
-
-#include <random>
-#include <limits>
-#include <type_traits>
-#include <algorithm>
-
-#include <cmath>
-
-#include "../../common.hpp"
-
-namespace detail
-{
-
-template<class T>
-T make_value(typename scalar_type<T>::type x, typename std::enable_if<is_vector_type<T>::value>::type* = 0)
-{
-    T value;
-    for(size_t i = 0; i < vector_size<T>::value; i++)
-    {
-        value.s[i] = x;
-    }
-    return value;
-}
-
-template<class T>
-T make_value(T x, typename std::enable_if<!is_vector_type<T>::value>::type* = 0)
-{
-    return x;
-}
-
-template<class result_type, class IN1, class IN2>
-result_type multiply(const IN1& x, const IN2& y, typename std::enable_if<is_vector_type<result_type>::value>::type* = 0)
-{
-    static_assert(
-        (vector_size<IN1>::value == vector_size<IN2>::value)
-            && (vector_size<IN2>::value == vector_size<result_type>::value),
-        "Vector sizes must be the same."
-    );
-    typedef typename scalar_type<result_type>::type SCALAR;
-    result_type value;
-    for(size_t i = 0; i < vector_size<result_type>::value; i++)
-    {
-        value.s[i] = static_cast<SCALAR>(x.s[i]) * static_cast<SCALAR>(y.s[i]);
-    }
-    return value;
-}
-
-template<class result_type, class IN1, class IN2>
-result_type multiply(const IN1& x, const IN2& y, typename std::enable_if<!is_vector_type<result_type>::value>::type* = 0)
-{
-    static_assert(
-        !is_vector_type<IN1>::value && !is_vector_type<IN2>::value,
-        "IN1 and IN2 must be scalar types"
-    );
-    return static_cast<result_type>(x) * static_cast<result_type>(y);
-}
-
-template<class T>
-T get_min()
-{
-    typedef typename scalar_type<T>::type SCALAR;
-    return make_value<T>((std::numeric_limits<SCALAR>::min)());
-}
-
-template<class T>
-T get_max()
-{
-    typedef typename scalar_type<T>::type SCALAR;
-    return make_value<T>((std::numeric_limits<SCALAR>::max)());
-}
-
-template<class T>
-T get_part_max(typename scalar_type<T>::type x)
-{
-    typedef typename scalar_type<T>::type SCALAR;
-    return make_value<T>((std::numeric_limits<SCALAR>::max)() / x);
-}
-
-template<class T>
-T def_limit(typename scalar_type<T>::type x)
-{
-    return make_value<T>(x);
-}
-
-} // detail namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_TEST_DETAIL_VEC_HELPERS_HPP
diff --git a/test_conformance/clcpp/utils_test/generate_inputs.hpp b/test_conformance/clcpp/utils_test/generate_inputs.hpp
deleted file mode 100644
index bb0d750656..0000000000
--- a/test_conformance/clcpp/utils_test/generate_inputs.hpp
+++ /dev/null
@@ -1,331 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_TEST_GENERATE_INPUTS_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_TEST_GENERATE_INPUTS_HPP
-
-#include <random>
-#include <limits>
-#include <type_traits>
-#include <algorithm>
-
-#include <cmath>
-
-#include "../common.hpp"
-
-template <class type>
-std::vector<type> generate_input(size_t count,
-                                 const type& min,
-                                 const type& max,
-                                 const std::vector<type> special_cases,
-                                 typename std::enable_if<
-                                    is_vector_type<type>::value
-                                    && std::is_integral<typename scalar_type<type>::type>::value
-                                    // std::uniform_int_distribution<> does not work in VS2015 for cl_uchar and cl_char,
-                                    // because VS2015 thinks that use cl_int, because VS2015 thinks cl_uchar cl_char are
-                                    // not int types
-                                    && !(std::is_same<typename scalar_type<type>::type, cl_uchar>::value
-                                         || std::is_same<typename scalar_type<type>::type, cl_char>::value)
-                                 >::type* = 0)
-{
-    typedef typename scalar_type<type>::type SCALAR;
-    const size_t vec_size = vector_size<type>::value;
-
-    std::vector<type> input(count);
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::vector<std::uniform_int_distribution<SCALAR>> dists(vec_size);
-    for(size_t i = 0; i < vec_size; i++)
-    {
-        dists[i] = std::uniform_int_distribution<SCALAR>(min.s[i], max.s[i]);
-    }
-    for(auto& i : input)
-    {
-        for(size_t j = 0; j < vec_size; j++)
-        {
-            i.s[j] = dists[j](gen);
-        }
-    }
-
-    input.insert(input.begin(), special_cases.begin(), special_cases.end());
-    input.resize(count);
-    return input;
-}
-
-template <class type>
-std::vector<type> generate_input(size_t count,
-                                 const type& min,
-                                 const type& max,
-                                 const std::vector<type> special_cases,
-                                 typename std::enable_if<
-                                    is_vector_type<type>::value
-                                    && std::is_integral<typename scalar_type<type>::type>::value
-                                    // std::uniform_int_distribution<> does not work in VS2015 for cl_uchar and cl_char,
-                                    // because VS2015 thinks that use cl_int, because VS2015 thinks cl_uchar cl_char are
-                                    // not int types
-                                    && (std::is_same<typename scalar_type<type>::type, cl_uchar>::value
-                                        || std::is_same<typename scalar_type<type>::type, cl_char>::value)
-                                 >::type* = 0)
-{
-    typedef typename scalar_type<type>::type SCALAR;
-    const size_t vec_size = vector_size<type>::value;
-
-    std::vector<type> input(count);
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::vector<std::uniform_int_distribution<cl_int>> dists(vec_size);
-    for(size_t i = 0; i < vec_size; i++)
-    {
-        dists[i] = std::uniform_int_distribution<cl_int>(
-            static_cast<cl_int>(min.s[i]),
-            static_cast<cl_int>(max.s[i])
-        );
-    }
-    for(auto& i : input)
-    {
-        for(size_t j = 0; j < vec_size; j++)
-        {
-            i.s[j] = static_cast<SCALAR>(dists[j](gen));
-        }
-    }
-
-    input.insert(input.begin(), special_cases.begin(), special_cases.end());
-    input.resize(count);
-    return input;
-}
-
-
-template <class type>
-std::vector<type> generate_input(size_t count,
-                                 const type& min,
-                                 const type& max,
-                                 const std::vector<type> special_cases,
-                                 typename std::enable_if<
-                                    !is_vector_type<type>::value
-                                    && std::is_integral<type>::value
-                                    // std::uniform_int_distribution<> does not work in VS2015 for cl_uchar and cl_char,
-                                    // because VS2015 thinks that use cl_int, because VS2015 thinks cl_uchar cl_char are
-                                    // not int types
-                                    && !(std::is_same<type, cl_uchar>::value || std::is_same<type, cl_char>::value)
-                                 >::type* = 0)
-{
-    std::vector<type> input(count);
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution<type> dis(min, max);
-    for(auto& i : input)
-    {
-        i = dis(gen);
-    }
-
-    input.insert(input.begin(), special_cases.begin(), special_cases.end());
-    input.resize(count);
-    return input;
-}
-
-template <class type>
-std::vector<type> generate_input(size_t count,
-                                 const type& min,
-                                 const type& max,
-                                 const std::vector<type> special_cases,
-                                 typename std::enable_if<
-                                    !is_vector_type<type>::value
-                                    && std::is_integral<type>::value
-                                    // std::uniform_int_distribution<> does not work in VS2015 for cl_uchar and cl_char,
-                                    // because VS2015 thinks that use cl_int, because VS2015 thinks cl_uchar cl_char are
-                                    // not int types
-                                    && (std::is_same<type, cl_uchar>::value || std::is_same<type, cl_char>::value)
-                                 >::type* = 0)
-{
-    std::vector<type> input(count);
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution<cl_int> dis(
-        static_cast<cl_int>(min), static_cast<cl_int>(max)
-    );
-    for(auto& i : input)
-    {
-        i = static_cast<type>(dis(gen));
-    }
-
-    input.insert(input.begin(), special_cases.begin(), special_cases.end());
-    input.resize(count);
-    return input;
-}
-
-template <class type>
-std::vector<type> generate_input(size_t count,
-                                 const type& min,
-                                 const type& max,
-                                 const std::vector<type> special_cases,
-                                 typename std::enable_if<
-                                    is_vector_type<type>::value
-                                    && std::is_floating_point<typename scalar_type<type>::type>::value
-                                 >::type* = 0)
-{
-    typedef typename scalar_type<type>::type SCALAR;
-    const size_t vec_size = vector_size<type>::value;
-
-    std::vector<type> input(count);
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::vector<std::uniform_real_distribution<SCALAR>> dists(vec_size);
-    for(size_t i = 0; i < vec_size; i++)
-    {
-        // Fatal error
-        if(std::fpclassify(max.s[i]) == FP_SUBNORMAL || std::fpclassify(min.s[i]) == FP_SUBNORMAL)
-        {
-            log_error("ERROR: min and max value for input generation CAN NOT BE subnormal\n");
-        }
-        dists[i] = std::uniform_real_distribution<SCALAR>(min.s[i], max.s[i]);
-    }
-    for(auto& i : input)
-    {
-        for(size_t j = 0; j < vec_size; j++)
-        {
-            SCALAR x = dists[j](gen);
-            while(std::fpclassify(x) == FP_SUBNORMAL)
-            {
-                x = dists[j](gen);
-            }
-            i.s[j] = x;
-        }
-    }
-
-    input.insert(input.begin(), special_cases.begin(), special_cases.end());
-    input.resize(count);
-    return input;
-}
-
-template <class type>
-std::vector<type> generate_input(size_t count,
-                                 const type& min,
-                                 const type& max,
-                                 const std::vector<type> special_cases,
-                                 typename std::enable_if<
-                                    !is_vector_type<type>::value
-                                    && std::is_floating_point<type>::value
-                                 >::type* = 0)
-{
-    // Fatal error
-    if(std::fpclassify(max) == FP_SUBNORMAL || std::fpclassify(min) == FP_SUBNORMAL)
-    {
-        log_error("ERROR: min and max value for input generation CAN NOT BE subnormal\n");
-    }
-    std::vector<type> input(count);
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_real_distribution<type> dis(min, max);
-    for(auto& i : input)
-    {
-        type x = dis(gen);
-        while(std::fpclassify(x) == FP_SUBNORMAL)
-        {
-            x = dis(gen);
-        }
-        i = x;
-    }
-
-    input.insert(input.begin(), special_cases.begin(), special_cases.end());
-    input.resize(count);
-    return input;
-}
-
-template <class type>
-std::vector<type> generate_output(size_t count,
-                                  typename scalar_type<type>::type svalue = typename scalar_type<type>::type(0),
-                                  typename std::enable_if<is_vector_type<type>::value>::type* = 0)
-{
-    type value;
-    for(size_t i = 0; i < vector_size<type>::value; i++)
-        value.s[i] = svalue;
-    return std::vector<type>(count, value);
-}
-
-template <class type>
-std::vector<type> generate_output(size_t count,
-                                  type svalue = type(0),
-                                  typename std::enable_if<!is_vector_type<type>::value>::type* = 0)
-{
-    return std::vector<type>(count, svalue);
-}
-
-template<class T, class K>
-void prepare_special_cases(std::vector<T>& in1_spec_cases, std::vector<K>& in2_spec_cases)
-{
-    if(in1_spec_cases.empty() || in2_spec_cases.empty())
-    {
-        return;
-    }
-
-    size_t new_size = in1_spec_cases.size() * in2_spec_cases.size();
-    std::vector<T> new_in1(new_size);
-    std::vector<K> new_in2(new_size);
-    for(size_t i = 0; i < in1_spec_cases.size(); i++)
-    {
-        for(size_t j = 0; j < in2_spec_cases.size(); j++)
-        {
-            new_in1[(i * in2_spec_cases.size()) + j] = in1_spec_cases[i];
-            new_in2[(i * in2_spec_cases.size()) + j] = in2_spec_cases[j];
-        }
-    }
-    in1_spec_cases = new_in1;
-    in2_spec_cases = new_in2;
-}
-
-template<class T, class K, class M>
-void prepare_special_cases(std::vector<T>& in1_spec_cases,
-                           std::vector<K>& in2_spec_cases,
-                           std::vector<M>& in3_spec_cases)
-{
-    if(in3_spec_cases.empty())
-    {
-        return prepare_special_cases(in1_spec_cases, in2_spec_cases);
-    }
-    else if (in2_spec_cases.empty())
-    {
-        return prepare_special_cases(in1_spec_cases, in3_spec_cases);
-    }
-    else if (in1_spec_cases.empty())
-    {
-        return prepare_special_cases(in2_spec_cases, in3_spec_cases);
-    }
-
-    size_t new_size = in1_spec_cases.size() * in2_spec_cases.size() * in3_spec_cases.size();
-    std::vector<T> new_in1(new_size);
-    std::vector<K> new_in2(new_size);
-    std::vector<M> new_in3(new_size);
-    for(size_t i = 0; i < in1_spec_cases.size(); i++)
-    {
-        for(size_t j = 0; j < in2_spec_cases.size(); j++)
-        {
-            for(size_t k = 0; k < in3_spec_cases.size(); k++)
-            {
-                size_t idx =
-                    (i * in2_spec_cases.size() * in3_spec_cases.size())
-                    + (j * in3_spec_cases.size())
-                    + k;
-                new_in1[idx] = in1_spec_cases[i];
-                new_in2[idx] = in2_spec_cases[j];
-                new_in3[idx] = in3_spec_cases[k];
-            }
-        }
-    }
-    in1_spec_cases = new_in1;
-    in2_spec_cases = new_in2;
-    in3_spec_cases = new_in3;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_TEST_GENERATE_INPUTS_HPP
diff --git a/test_conformance/clcpp/utils_test/ternary.hpp b/test_conformance/clcpp/utils_test/ternary.hpp
deleted file mode 100644
index 2a6f6b551a..0000000000
--- a/test_conformance/clcpp/utils_test/ternary.hpp
+++ /dev/null
@@ -1,364 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_TEST_TERNARY_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_TEST_TERNARY_HPP
-
-#include <type_traits>
-#include <algorithm>
-#include <string>
-#include <cmath>
-
-#include "../common.hpp"
-
-#include "detail/base_func_type.hpp"
-#include "generate_inputs.hpp"
-#include "compare.hpp"
-
-template<class IN1, class IN2, class IN3, class OUT1>
-struct ternary_func : public detail::base_func_type<OUT1>
-{
-    typedef IN1 in1_type;
-    typedef IN2 in2_type;
-    typedef IN3 in3_type;
-    typedef OUT1 out_type;
-
-    virtual ~ternary_func() {};
-    virtual std::string str() = 0;
-
-    std::string decl_str()
-    {
-        return type_name<OUT1>() + "(" + type_name<IN1>() + ", " + type_name<IN2>()+  ", " + type_name<IN3>() + ")";
-    }
-
-    bool is_in1_bool()
-    {
-        return false;
-    }
-
-    bool is_in2_bool()
-    {
-        return false;
-    }
-
-    bool is_in3_bool()
-    {
-        return false;
-    }
-
-    IN1 min1()
-    {
-        return detail::get_min<IN1>();
-    }
-
-    IN1 max1()
-    {
-        return detail::get_max<IN1>();
-    }
-
-    IN2 min2()
-    {
-        return detail::get_min<IN2>();
-    }
-
-    IN2 max2()
-    {
-        return detail::get_max<IN2>();
-    }
-
-    IN3 min3()
-    {
-        return detail::get_min<IN3>();
-    }
-
-    IN3 max3()
-    {
-        return detail::get_max<IN3>();
-    }
-
-    std::vector<IN1> in1_special_cases()
-    {
-        return { };
-    }
-
-    std::vector<IN2> in2_special_cases()
-    {
-        return { };
-    }
-
-    std::vector<IN3> in3_special_cases()
-    {
-        return { };
-    }
-
-    template<class T>
-    typename make_vector_type<cl_double, vector_size<T>::value>::type
-    delta(const IN1& in1, const IN2& in2, const IN3& in3, const T& expected)
-    {
-        typedef
-            typename make_vector_type<cl_double, vector_size<T>::value>::type
-            delta_vector_type;
-        // Take care of unused variable warning
-        (void) in1;
-        (void) in2;
-        (void) in3;
-        auto e = detail::make_value<delta_vector_type>(1e-3);
-        return detail::multiply<delta_vector_type>(e, expected);
-    }
-};
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <class func_type, class in1_type, class in2_type, class in3_type, class out_type>
-std::string generate_kernel_ternary(func_type func)
-{
-    std::string in1_value = "input1[gid]";
-    if(func.is_in1_bool())
-    {
-        std::string i = vector_size<in1_type>::value == 1 ? "" : std::to_string(vector_size<in1_type>::value);
-        in1_value = "(input1[gid] != (int" + i + ")(0))";
-    }
-    std::string in2_value = "input2[gid]";
-    if(func.is_in2_bool())
-    {
-        std::string i = vector_size<in2_type>::value == 1 ? "" : std::to_string(vector_size<in2_type>::value);
-        in2_value = "(input2[gid] != (int" + i + ")(0))";
-    }
-    std::string in3_value = "input3[gid]";
-    if(func.is_in3_bool())
-    {
-        std::string i = vector_size<in3_type>::value == 1 ? "" : std::to_string(vector_size<in3_type>::value);
-        in3_value = "(input3[gid] != (int" + i + ")(0))";
-    }
-    std::string function_call = func.str() + "(" + in1_value + ", " + in2_value + ", " + in3_value + ")";
-    if(func.is_out_bool())
-    {
-        std::string i = vector_size<out_type>::value == 1 ? "" : std::to_string(vector_size<out_type>::value);
-        function_call = "convert_int" + i + "(" + func.str() + "(" + in1_value + ", " + in2_value + ", " + in3_value + "))";
-    }
-    return
-        "__kernel void " + func.get_kernel_name() + "(global " + type_name<in1_type>() + " *input1,\n"
-        "                                      global " + type_name<in2_type>() + " *input2,\n"
-        "                                      global " + type_name<in3_type>() + " *input3,\n"
-        "                                      global " + type_name<out_type>() + " *output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    output[gid] = " + function_call + ";\n"
-        "}\n";
-}
-#else
-template <class func_type, class in1_type, class in2_type, class in3_type, class out_type>
-std::string generate_kernel_ternary(func_type func)
-{
-    std::string headers = func.headers();
-    std::string in1_value = "input1[gid]";
-    if(func.is_in1_bool())
-    {
-        std::string i = vector_size<in1_type>::value == 1 ? "" : std::to_string(vector_size<in1_type>::value);
-        in1_value = "(input1[gid] != (int" + i + ")(0))";
-    }
-    std::string in2_value = "input2[gid]";
-    if(func.is_in2_bool())
-    {
-        std::string i = vector_size<in2_type>::value == 1 ? "" : std::to_string(vector_size<in2_type>::value);
-        in2_value = "(input2[gid] != (int" + i + ")(0))";
-    }
-    std::string in3_value = "input3[gid]";
-    if(func.is_in3_bool())
-    {
-        std::string i = vector_size<in3_type>::value == 1 ? "" : std::to_string(vector_size<in3_type>::value);
-        in3_value = "(input3[gid] != (int" + i + ")(0))";
-    }
-    std::string function_call = func.str() + "(" + in1_value + ", " + in2_value + ", " + in3_value + ")";
-    if(func.is_out_bool())
-    {
-        std::string i = vector_size<out_type>::value == 1 ? "" : std::to_string(vector_size<out_type>::value);
-        function_call = "convert_cast<int" + i + ">(" + func.str() + "(" + in1_value + ", " + in2_value + ", " + in3_value + "))";
-    }
-    if(func.is_out_bool() || func.is_in1_bool() || func.is_in2_bool() || func.is_in3_bool())
-    {
-        if(headers.find("#include <opencl_convert>") == std::string::npos)
-        {
-            headers += "#include <opencl_convert>\n";
-        }
-    }
-    return
-        "" + func.defs() +
-        "" + headers +
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "using namespace cl;\n"
-        "__kernel void " + func.get_kernel_name() + "(global_ptr<" + type_name<in1_type>() + "[]> input1,\n"
-        "                                      global_ptr<" + type_name<in2_type>() + "[]> input2,\n"
-        "                                      global_ptr<" + type_name<in3_type>() + "[]> input3,\n"
-        "                                      global_ptr<" + type_name<out_type>() + "[]> output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    output[gid] = " + function_call + ";\n"
-        "}\n";
-}
-#endif
-
-template<class INPUT1, class INPUT2, class INPUT3, class OUTPUT, class ternary_op>
-bool verify_ternary(const std::vector<INPUT1> &in1,
-                    const std::vector<INPUT2> &in2,
-                    const std::vector<INPUT3> &in3,
-                    const std::vector<OUTPUT> &out,
-                    ternary_op op)
-{
-    for(size_t i = 0; i < in1.size(); i++)
-    {
-        auto expected = op(in1[i], in2[i], in3[i]);
-        if(!are_equal(expected, out[i], op.delta(in1[i], in2[i], in3[i], expected), op))
-        {
-            print_error_msg(expected, out[i], i, op);
-            return false;
-        }
-    }
-    return true;
-}
-
-template <class ternary_op>
-int test_ternary_func(cl_device_id device, cl_context context, cl_command_queue queue, size_t count, ternary_op op)
-{
-    cl_mem buffers[4];
-    cl_program program;
-    cl_kernel kernel;
-    size_t work_size[1];
-    int err;
-
-    typedef typename ternary_op::in1_type INPUT1;
-    typedef typename ternary_op::in2_type INPUT2;
-    typedef typename ternary_op::in3_type INPUT3;
-    typedef typename ternary_op::out_type OUTPUT;
-
-    // Don't run test for unsupported types
-    if(!(type_supported<INPUT1>(device)
-         && type_supported<INPUT2>(device)
-         && type_supported<INPUT3>(device)
-         && type_supported<OUTPUT>(device)))
-    {
-        return CL_SUCCESS;
-    }
-
-    std::string code_str = generate_kernel_ternary<ternary_op, INPUT1, INPUT2, INPUT3, OUTPUT>(op);
-    std::string kernel_name = op.get_kernel_name();
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name, "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(err)
-#endif
-
-    std::vector<INPUT1> in1_spec_cases = op.in1_special_cases();
-    std::vector<INPUT2> in2_spec_cases = op.in2_special_cases();
-    std::vector<INPUT3> in3_spec_cases = op.in3_special_cases();
-    prepare_special_cases(in1_spec_cases, in2_spec_cases, in3_spec_cases);
-    std::vector<INPUT1> input1 = generate_input<INPUT1>(count, op.min1(), op.max1(), in1_spec_cases);
-    std::vector<INPUT2> input2 = generate_input<INPUT2>(count, op.min2(), op.max2(), in2_spec_cases);
-    std::vector<INPUT3> input3 = generate_input<INPUT3>(count, op.min3(), op.max3(), in3_spec_cases);
-    std::vector<OUTPUT> output = generate_output<OUTPUT>(count);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(INPUT1) * input1.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(INPUT2) * input2.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    buffers[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(INPUT3) * input3.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    buffers[3] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(OUTPUT) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(INPUT1) * input1.size(),
-        static_cast<void *>(input1.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(INPUT2) * input2.size(),
-        static_cast<void *>(input2.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[2], CL_TRUE, 0, sizeof(INPUT3) * input3.size(),
-        static_cast<void *>(input3.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    err |= clSetKernelArg(kernel, 2, sizeof(buffers[2]), &buffers[2]);
-    err |= clSetKernelArg(kernel, 3, sizeof(buffers[3]), &buffers[3]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    work_size[0] = count;
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[3], CL_TRUE, 0, sizeof(OUTPUT) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (!verify_ternary(input1, input2, input3, output, op))
-    {
-        RETURN_ON_ERROR_MSG(-1,
-            "test_%s %s(%s, %s, %s) failed", op.str().c_str(),
-            type_name<OUTPUT>().c_str(),
-            type_name<INPUT1>().c_str(),
-            type_name<INPUT2>().c_str(),
-            type_name<INPUT3>().c_str()
-        );
-    }
-    log_info(
-        "test_%s %s(%s, %s, %s) passed\n", op.str().c_str(),
-        type_name<OUTPUT>().c_str(),
-        type_name<INPUT1>().c_str(),
-        type_name<INPUT2>().c_str(),
-        type_name<INPUT3>().c_str()
-    );
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseMemObject(buffers[2]);
-    clReleaseMemObject(buffers[3]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_TEST_TERNARY_HPP
diff --git a/test_conformance/clcpp/utils_test/unary.hpp b/test_conformance/clcpp/utils_test/unary.hpp
deleted file mode 100644
index 456ad3f02f..0000000000
--- a/test_conformance/clcpp/utils_test/unary.hpp
+++ /dev/null
@@ -1,259 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_UTILS_TEST_UNARY_HPP
-#define TEST_CONFORMANCE_CLCPP_UTILS_TEST_UNARY_HPP
-
-#include <type_traits>
-#include <algorithm>
-#include <string>
-#include <cmath>
-
-#include "../common.hpp"
-
-#include "detail/base_func_type.hpp"
-#include "generate_inputs.hpp"
-#include "compare.hpp"
-
-template<class IN1, class OUT1>
-struct unary_func : public detail::base_func_type<OUT1>
-{
-    typedef IN1 in_type;
-    typedef OUT1 out_type;
-
-    virtual ~unary_func() {};
-    virtual std::string str() = 0;
-
-    // Return string with function type, for example: int(float).
-    std::string decl_str()
-    {
-        return type_name<OUT1>() + "(" + type_name<IN1>() + ")";
-    }
-
-    // Return true if IN1 type in OpenCL kernel should be treated
-    // as bool type; false otherwise.
-    bool is_in1_bool()
-    {
-        return false;
-    }
-
-    // Return min value that can be used as a first argument.
-    IN1 min1()
-    {
-        return detail::get_min<IN1>();
-    }
-
-    // Return max value that can be used as a first argument.
-    IN1 max1()
-    {
-        return detail::get_max<IN1>();
-    }
-
-    // This returns a list of special cases input values we want to
-    // test.
-    std::vector<IN1> in_special_cases()
-    {
-        return { };
-    }
-
-    // Max error. Error should be raised if
-    // abs(result - expected) > delta(.., expected)
-    //
-    // Default value: 0.001 * expected
-    //
-    // (This effects how are_equal() function works,
-    // it may not have effect if verify() method in derived
-    // class does not use are_equal() function.)
-    //
-    // Only for FP numbers/vectors
-    template<class T>
-    typename make_vector_type<cl_double, vector_size<T>::value>::type
-    delta(const IN1& in1, const T& expected)
-    {
-        typedef
-            typename make_vector_type<cl_double, vector_size<T>::value>::type
-            delta_vector_type;
-        // Take care of unused variable warning
-        (void) in1;
-        auto e = detail::make_value<delta_vector_type>(1e-3);
-        return detail::multiply<delta_vector_type>(e, expected);
-    }
-};
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <class func_type, class in_type, class out_type>
-std::string generate_kernel_unary(func_type func)
-{
-    std::string in1_value = "input[gid]";
-    // Convert uintN to boolN values
-    if(func.is_in1_bool())
-    {
-        std::string i = vector_size<in_type>::value == 1 ? "" : std::to_string(vector_size<in_type>::value);
-        in1_value = "(input[gid] != (int" + i + ")(0))";
-    }
-    std::string function_call = func.str() + "(" + in1_value + ");";
-    // Convert boolN result of funtion func_type to uintN
-    if(func.is_out_bool())
-    {
-        std::string i = vector_size<out_type>::value == 1 ? "" : std::to_string(vector_size<out_type>::value);
-        function_call = "convert_int" + i + "(" + func.str() + "(" + in1_value + "))";
-    }
-    return
-        "__kernel void " + func.get_kernel_name() + "(global " + type_name<in_type>() + " *input, global " + type_name<out_type>() + " *output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    output[gid] = " + function_call + ";\n"
-        "}\n";
-}
-#else
-template <class func_type, class in_type, class out_type>
-std::string generate_kernel_unary(func_type func)
-{
-    std::string headers = func.headers();
-    std::string in1_value = "input[gid]";
-    if(func.is_in1_bool())
-    {
-        std::string i = vector_size<in_type>::value == 1 ? "" : std::to_string(vector_size<in_type>::value);
-        in1_value = "(input[gid] != (int" + i + ")(0))";
-    }
-    std::string function_call = func.str() + "(" + in1_value + ")";
-    if(func.is_out_bool())
-    {
-        std::string i = vector_size<out_type>::value == 1 ? "" : std::to_string(vector_size<out_type>::value);
-        function_call = "convert_cast<int" + i + ">(" + func.str() + "(" + in1_value + "))";
-    }
-    if(func.is_out_bool() || func.is_in1_bool())
-    {
-        if(headers.find("#include <opencl_convert>") == std::string::npos)
-        {
-            headers += "#include <opencl_convert>\n";
-        }
-    }
-    return
-        "" + func.defs() +
-        "" + headers +
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "using namespace cl;\n"
-        "__kernel void " + func.get_kernel_name() + "(global_ptr<" + type_name<in_type>() +  "[]> input,"
-                                              "global_ptr<" + type_name<out_type>() + "[]> output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    output[gid] = " + function_call + ";\n"
-        "}\n";
-}
-#endif
-
-template<class INPUT, class OUTPUT, class unary_op>
-bool verify_unary(const std::vector<INPUT> &in, const std::vector<OUTPUT> &out, unary_op op)
-{
-    for(size_t i = 0; i < in.size(); i++)
-    {
-        auto expected = op(in[i]);
-        if(!are_equal(expected, out[i], op.delta(in[i], expected), op))
-        {
-            print_error_msg(expected, out[i], i, op);
-            return false;
-        }
-    }
-    return true;
-}
-
-template <class unary_op>
-int test_unary_func(cl_device_id device, cl_context context, cl_command_queue queue, size_t count, unary_op op)
-{
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t work_size[1];
-    int err;
-
-    typedef typename unary_op::in_type INPUT;
-    typedef typename unary_op::out_type OUTPUT;
-
-    // Don't run test for unsupported types
-    if(!(type_supported<INPUT>(device) && type_supported<OUTPUT>(device)))
-    {
-        return CL_SUCCESS;
-    }
-
-    std::string code_str = generate_kernel_unary<unary_op, INPUT, OUTPUT>(op);
-    std::string kernel_name = op.get_kernel_name();
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name, "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(err)
-#endif
-
-    std::vector<INPUT> input = generate_input<INPUT>(count, op.min1(), op.max1(), op.in_special_cases());
-    std::vector<OUTPUT> output = generate_output<OUTPUT>(count);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(INPUT) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(OUTPUT) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer")
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(INPUT) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    work_size[0] = count;
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(OUTPUT) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (!verify_unary(input, output, op))
-    {
-        RETURN_ON_ERROR_MSG(-1, "test_%s %s(%s) failed", op.str().c_str(), type_name<OUTPUT>().c_str(), type_name<INPUT>().c_str());
-    }
-    log_info("test_%s %s(%s) passed\n", op.str().c_str(), type_name<OUTPUT>().c_str(), type_name<INPUT>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_UTILS_TEST_UNARY_HPP
diff --git a/test_conformance/clcpp/vload_vstore/CMakeLists.txt b/test_conformance/clcpp/vload_vstore/CMakeLists.txt
deleted file mode 100644
index c66cb6f75c..0000000000
--- a/test_conformance/clcpp/vload_vstore/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_VLOAD_VSTORE_FUNCS)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/vload_vstore/common.hpp b/test_conformance/clcpp/vload_vstore/common.hpp
deleted file mode 100644
index d78d765411..0000000000
--- a/test_conformance/clcpp/vload_vstore/common.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_COMMON_HPP
-#define TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_COMMON_HPP
-
-#include <type_traits>
-#include <cmath>
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#include "half_utils.hpp"
-#include <CL/cl_half.h>
-
-// Generates cl_half input
-std::vector<cl_half> generate_half_input(size_t count,
-                                         const cl_float& min,
-                                         const cl_float& max,
-                                         const std::vector<cl_half> special_cases)
-{
-    std::vector<cl_half> input(count);
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_real_distribution<cl_float> dis(min, max);
-    for(auto& i : input)
-    {
-        i = cl_half_from_float(dis(gen), CL_HALF_RTE);
-    }
-
-    input.insert(input.begin(), special_cases.begin(), special_cases.end());
-    input.resize(count);
-    return input;
-}
-
-// Generates input for vload_vstore tests, we can't just simply use function
-// generate_input<type>(...), because cl_half is typedef of cl_short (but generating
-// cl_shorts and generating cl_halfs are different operations).
-template <class type>
-std::vector<type> vload_vstore_generate_input(size_t count,
-                                              const type& min,
-                                              const type& max, 
-                                              const std::vector<type> special_cases,
-                                              const bool generate_half,
-                                              typename std::enable_if<
-                                                  std::is_same<type, cl_half>::value
-                                              >::type* = 0)
-{
-    if(!generate_half)
-    {
-        return generate_input<type>(count, min, max, special_cases);
-    }
-    return generate_half_input(count, -(CL_HALF_MAX/4.f), (CL_HALF_MAX/4.f), special_cases);
-}
-
-// If !std::is_same<type, cl_half>::value, we can just use generate_input<type>(...).
-template <class type>
-std::vector<type> vload_vstore_generate_input(size_t count,
-                                              const type& min,
-                                              const type& max, 
-                                              const std::vector<type> special_cases,
-                                              const bool generate_half,
-                                              typename std::enable_if<
-                                                  !std::is_same<type, cl_half>::value
-                                              >::type* = 0)
-{
-    return generate_input<type>(count, min, max, special_cases);
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_RELATIONAL_FUNCS_COMMON_HPP
diff --git a/test_conformance/clcpp/vload_vstore/half_utils.hpp b/test_conformance/clcpp/vload_vstore/half_utils.hpp
deleted file mode 100644
index ce7ae822ce..0000000000
--- a/test_conformance/clcpp/vload_vstore/half_utils.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_HALF_UTILS_HPP
-#define TEST_CONFORMANCE_CLCPP_HALF_UTILS_HPP
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#include <cmath>
-
-namespace detail 
-{
-
-template<class INT_TYPE>
-inline int clz(INT_TYPE x)
-{
-    int count = 0;
-    if(std::is_unsigned<INT_TYPE>::value)
-    {
-        cl_ulong value = x;
-        value <<= 8 * sizeof(value) - (8 * sizeof(x));
-        for(count = 0; 0 == (value & (CL_LONG_MIN)); count++)
-        {
-            value <<= 1;
-        }
-    }
-    else
-    {            
-        cl_long value = x;
-        value <<= 8 * sizeof(value) - (8 * sizeof(x));
-        for(count = 0; 0 == (value & (CL_LONG_MIN)); count++)
-        {
-            value <<= 1;
-        }
-    }
-    return count;
-}
-
-} // namespace detail 
-
-#endif // TEST_CONFORMANCE_CLCPP_HALF_UTILS_HPP
diff --git a/test_conformance/clcpp/vload_vstore/main.cpp b/test_conformance/clcpp/vload_vstore/main.cpp
deleted file mode 100644
index e5c4fdd062..0000000000
--- a/test_conformance/clcpp/vload_vstore/main.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "vload_funcs.hpp"
-#include "vstore_funcs.hpp"
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/vload_vstore/vload_funcs.hpp b/test_conformance/clcpp/vload_vstore/vload_funcs.hpp
deleted file mode 100644
index cb9415e0b8..0000000000
--- a/test_conformance/clcpp/vload_vstore/vload_funcs.hpp
+++ /dev/null
@@ -1,367 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_VLOAD_VSTORE_FUNCS_VLOAD_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_VLOAD_VSTORE_FUNCS_VLOAD_FUNCS_HPP
-
-#include <iterator>
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#include "common.hpp"
-
-#include <CL/cl_half.h>
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <class func_type, class in_type, class out_type, size_t N>
-std::string generate_kernel_vload(func_type func)
-{
-    std::string input1_type_str = type_name<in_type>();
-    if(func.is_in1_half())
-    {
-        input1_type_str = "half";
-    }
-    std::string output1_type_str = type_name<out_type>();
-    if(N == 3)
-    {
-        output1_type_str[output1_type_str.size() - 1] = '3';
-    }
-    return
-        "__kernel void test_" + func.str() + "(global " + input1_type_str + " *input, global " + output1_type_str + " *output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    output[gid] = " + func.str() + std::to_string(N) + "(gid, input);\n"
-        "}\n";
-}
-#else
-template <class func_type, class in_type, class out_type, size_t N>
-std::string generate_kernel_vload(func_type func)
-{
-    std::string input1_type_str = type_name<in_type>();
-    if(func.is_in1_half())
-    {
-        input1_type_str = "half";
-    }
-    std::string output1_type_str = type_name<out_type>();
-    if(N == 3)
-    {
-        output1_type_str[output1_type_str.size() - 1] = '3';
-    }
-    return
-        "" + func.defs() +
-        "" + func.headers() +
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "using namespace cl;\n"
-        "__kernel void test_" + func.str() + "(global_ptr<" + input1_type_str +  "[]> input,"
-                                              "global_ptr<" + output1_type_str + "[]> output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    output[gid] = " + func.str() + "<" + std::to_string(N) + ">(gid, input.get());\n"
-        "}\n";
-}
-#endif
-
-template<class INPUT, class OUTPUT, class vload_op>
-bool verify_vload(const std::vector<INPUT> &in, const std::vector<OUTPUT> &out, vload_op op)
-{
-    for(size_t i = 0; i < out.size(); i++)
-    {
-        auto expected = op(i, in.begin());
-        for(size_t j = 0; j < vload_op::vector_size; j++)
-        {
-            size_t idx = (i * vector_size<OUTPUT>::value) + j;
-            if(!are_equal(expected.s[j], out[i].s[j], op.delta(in[idx], expected.s[j]), op))
-            {
-                print_error_msg(expected, out[i], i, op);
-                return false;
-            }
-        }
-    }
-    return true;
-}
-
-template <class vload_op>
-int test_vload_func(cl_device_id device, cl_context context, cl_command_queue queue, size_t count, vload_op op)
-{
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t work_size[1];
-    int err;
-
-    typedef typename vload_op::in_type INPUT;
-    typedef typename vload_op::out_type OUTPUT;
-
-    // Don't run test for unsupported types
-    if(!(type_supported<INPUT>(device) && type_supported<OUTPUT>(device)))
-    {
-        return CL_SUCCESS;
-    }
-
-    std::string code_str = generate_kernel_vload<vload_op, INPUT, OUTPUT, vload_op::vector_size>(op);
-    std::string kernel_name("test_"); kernel_name += op.str();
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name, "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(err)
-#endif
-
-    std::vector<INPUT> input = vload_vstore_generate_input<INPUT>(
-        count * vector_size<OUTPUT>::value, op.min1(), op.max1(), op.in_special_cases(), op.is_in1_half()
-    );
-    std::vector<OUTPUT> output = generate_output<OUTPUT>(count);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(INPUT) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(OUTPUT) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(INPUT) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    work_size[0] = count;
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(OUTPUT) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (!verify_vload(input, output, op))
-    {
-        RETURN_ON_ERROR_MSG(-1, "test_%s %s(%s) failed",
-            op.str().c_str(),
-            type_name<OUTPUT>().c_str(),
-            type_name<INPUT>().c_str()
-        );
-    }
-    log_info("test_%s %s(%s) passed\n", op.str().c_str(), type_name<OUTPUT>().c_str(), type_name<INPUT>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-template <class IN1, cl_int N /* Vector size */>
-struct vload_func : public unary_func<
-                        IN1,
-                        typename make_vector_type<IN1, N>::type /* create IN1N type */
-                    >
-{
-    typedef typename make_vector_type<IN1, N>::type result_type;
-    const static size_t vector_size = N;
-
-    std::string str()
-    {
-        return "vload";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_vector_load_store>\n";
-    }
-
-    template<class Iterator>
-    result_type operator()(const size_t offset, Iterator x)
-    {
-        static_assert(
-            !is_vector_type<IN1>::value,
-            "IN1 must be scalar type"
-        );
-        static_assert(
-            std::is_same<typename std::iterator_traits<Iterator>::value_type, IN1>::value,
-            "std::iterator_traits<Iterator>::value_type must be IN1"
-        );
-
-        typedef typename std::iterator_traits<Iterator>::difference_type diff_type;
-
-        result_type r;
-        Iterator temp = x + static_cast<diff_type>(offset * N);
-        for(size_t i = 0; i < N; i++)
-        {
-            r.s[i] = *temp;
-            temp++;
-        }
-        return r;
-    }
-
-    bool is_in1_half()
-    {
-        return false;
-    }
-};
-
-template <cl_int N /* Vector size */>
-struct vload_half_func : public unary_func<
-                            cl_half,
-                            typename make_vector_type<cl_float, N>::type /* create IN1N type */
-                         >
-{
-    typedef typename make_vector_type<cl_float, N>::type result_type;
-    const static size_t vector_size = N;
-
-    std::string str()
-    {
-        return "vload_half";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_vector_load_store>\n";
-    }
-
-    template<class Iterator>
-    result_type operator()(const size_t offset, Iterator x)
-    {
-        static_assert(
-            std::is_same<typename std::iterator_traits<Iterator>::value_type, cl_half>::value,
-            "std::iterator_traits<Iterator>::value_type must be cl_half"
-        );
-
-        typedef typename std::iterator_traits<Iterator>::difference_type diff_type;
-
-        result_type r;
-        Iterator temp = x + static_cast<diff_type>(offset * N);
-        for(size_t i = 0; i < N; i++)
-        {
-            r.s[i] = cl_half_to_float(*temp);
-            temp++;
-        }
-        return r;
-    }
-
-    bool is_in1_half()
-    {
-        return true;
-    }
-};
-
-template <cl_int N /* Vector size */>
-struct vloada_half_func : public unary_func<
-                            cl_half,
-                            typename make_vector_type<cl_float, N>::type /* create IN1N type */
-                         >
-{
-    typedef typename make_vector_type<cl_float, N>::type result_type;
-    const static size_t vector_size = N;
-
-    std::string str()
-    {
-        return "vloada_half";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_vector_load_store>\n";
-    }
-
-    template<class Iterator>
-    result_type operator()(const size_t offset, Iterator x)
-    {
-        static_assert(
-            std::is_same<typename std::iterator_traits<Iterator>::value_type, cl_half>::value,
-            "std::iterator_traits<Iterator>::value_type must be cl_half"
-        );
-
-        typedef typename std::iterator_traits<Iterator>::difference_type diff_type;
-
-        result_type r;
-        size_t alignment = N == 3 ? 4 : N;
-        Iterator temp = x + static_cast<diff_type>(offset * alignment);
-        for(size_t i = 0; i < N; i++)
-        {
-            r.s[i] = cl_half_to_float(*temp);
-            temp++;
-        }
-        return r;
-    }
-
-    bool is_in1_half()
-    {
-        return true;
-    }
-};
-
-AUTO_TEST_CASE(test_vload_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-#define TEST_VLOAD_FUNC_MACRO(CLASS) \
-    last_error = test_vload_func( \
-        device, context, queue, n_elems, CLASS \
-    ); \
-    CHECK_ERROR(last_error) \
-    error |= last_error;
-
-    TEST_VLOAD_FUNC_MACRO((vload_func<cl_uint,  2>()))
-    TEST_VLOAD_FUNC_MACRO((vload_func<cl_float, 4>()))
-    TEST_VLOAD_FUNC_MACRO((vload_func<cl_short, 8>()))
-    TEST_VLOAD_FUNC_MACRO((vload_func<cl_int, 16>()))
-
-    TEST_VLOAD_FUNC_MACRO((vload_half_func<2>()))
-    TEST_VLOAD_FUNC_MACRO((vload_half_func<3>()))
-    TEST_VLOAD_FUNC_MACRO((vload_half_func<4>()))
-    TEST_VLOAD_FUNC_MACRO((vload_half_func<8>()))
-    TEST_VLOAD_FUNC_MACRO((vload_half_func<16>()))
-
-    TEST_VLOAD_FUNC_MACRO((vloada_half_func<2>()))
-    TEST_VLOAD_FUNC_MACRO((vloada_half_func<3>()))
-    TEST_VLOAD_FUNC_MACRO((vloada_half_func<4>()))
-    TEST_VLOAD_FUNC_MACRO((vloada_half_func<8>()))
-    TEST_VLOAD_FUNC_MACRO((vloada_half_func<16>()))
-
-#undef TEST_VLOAD_FUNC_MACRO
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_VLOAD_VSTORE_FUNCS_VLOAD_FUNCS_HPP
diff --git a/test_conformance/clcpp/vload_vstore/vstore_funcs.hpp b/test_conformance/clcpp/vload_vstore/vstore_funcs.hpp
deleted file mode 100644
index 7ffc584ea4..0000000000
--- a/test_conformance/clcpp/vload_vstore/vstore_funcs.hpp
+++ /dev/null
@@ -1,349 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_VLOAD_VSTORE_FUNCS_VSTORE_FUNCS_HPP
-#define TEST_CONFORMANCE_CLCPP_VLOAD_VSTORE_FUNCS_VSTORE_FUNCS_HPP
-
-#include <iterator>
-
-#include "../common.hpp"
-#include "../funcs_test_utils.hpp"
-
-#include "common.hpp"
-
-#include <CL/cl_half.h>
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <class func_type, class in_type, class out_type, size_t N>
-std::string generate_kernel_vstore(func_type func)
-{
-    std::string input1_type_str = type_name<in_type>();
-    if(N == 3)
-    {
-        input1_type_str[input1_type_str.size() - 1] = '3';
-    }
-    std::string output1_type_str = type_name<out_type>();
-    if(func.is_out_half())
-    {
-        output1_type_str = "half";
-    }
-    return
-        "__kernel void test_" + func.str() + "(global " + input1_type_str + " *input, global " + output1_type_str + " *output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    " + func.str() + std::to_string(N) + "(input[gid], gid, output);\n"
-        "}\n";
-}
-#else
-template <class func_type, class in_type, class out_type, size_t N>
-std::string generate_kernel_vstore(func_type func)
-{
-    std::string input1_type_str = type_name<in_type>();
-    if(N == 3)
-    {
-        input1_type_str[input1_type_str.size() - 1] = '3';
-    }
-    std::string output1_type_str = type_name<out_type>();
-    if(func.is_out_half())
-    {
-        output1_type_str = "half";
-    }
-    return
-        "" + func.defs() +
-        "" + func.headers() +
-        "#include <opencl_memory>\n"
-        "#include <opencl_work_item>\n"
-        "using namespace cl;\n"
-        "__kernel void test_" + func.str() + "(global_ptr<" + input1_type_str +  "[]> input,"
-                                              "global_ptr<" + output1_type_str + "[]> output)\n"
-        "{\n"
-        "    size_t gid = get_global_id(0);\n"
-        "    " + func.str() + "(input[gid], gid, output.get());\n"
-        "}\n";
-}
-#endif
-
-template<class INPUT, class OUTPUT, class vload_op>
-bool verify_vstore(const std::vector<INPUT> &in, const std::vector<OUTPUT> &out, vload_op op)
-{
-    for(size_t i = 0; i < in.size(); i++)
-    {
-        auto expected = op(in[i]);
-        for(size_t j = 0; j < vload_op::vector_size; j++)
-        {
-            size_t idx = (i * vload_op::vec_alignment) + j;
-            if(!are_equal(expected.s[j], out[idx], op.delta(in[i], expected).s[j], op))
-            {
-                print_error_msg(expected.s[j], out[idx], idx, op);
-                return false;
-            }
-        }
-    }
-    return true;
-}
-
-template <class vload_op>
-int test_vstore_func(cl_device_id device, cl_context context, cl_command_queue queue, size_t count, vload_op op)
-{
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t work_size[1];
-    int err;
-
-    typedef typename vload_op::in_type INPUT;
-    typedef typename vload_op::out_type OUTPUT;
-
-    // Don't run test for unsupported types
-    if(!(type_supported<INPUT>(device) && type_supported<OUTPUT>(device)))
-    {
-        return CL_SUCCESS;
-    }
-
-    std::string code_str = generate_kernel_vstore<vload_op, INPUT, OUTPUT, vload_op::vector_size>(op);
-    std::string kernel_name("test_"); kernel_name += op.str();
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name, "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
-    RETURN_ON_ERROR(err)
-#endif
-
-    std::vector<INPUT> input = generate_input<INPUT>(count, op.min1(), op.max1(), op.in_special_cases());
-    std::vector<OUTPUT> output = generate_output<OUTPUT>(count * vector_size<INPUT>::value);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(INPUT) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(OUTPUT) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(INPUT) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    work_size[0] = count;
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, NULL, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(OUTPUT) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (!verify_vstore(input, output, op))
-    {
-        RETURN_ON_ERROR_MSG(-1, "test_%s %s(%s) failed", op.str().c_str(), type_name<OUTPUT>().c_str(), type_name<INPUT>().c_str());
-    }
-    log_info("test_%s %s(%s) passed\n", op.str().c_str(), type_name<OUTPUT>().c_str(), type_name<INPUT>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-template <class T, cl_int N /* Vector size */>
-struct vstore_func : public unary_func<
-                        typename make_vector_type<T, N>::type,
-                        T
-                     >
-{
-    typedef typename make_vector_type<T, N>::type input1_type;
-    typedef typename make_vector_type<T, N>::type result_type;
-    const static size_t vector_size = N;
-    const static size_t vec_alignment = N;
-
-    std::string str()
-    {
-        return "vstore";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_vector_load_store>\n";
-    }
-
-    result_type operator()(const input1_type& in)
-    {
-        static_assert(
-            !is_vector_type<T>::value,
-            "T must be scalar type"
-        );
-        return in;
-    }
-
-    bool is_out_half()
-    {
-        return false;
-    }
-};
-
-template <cl_int N /* Vector size */>
-struct vstore_half_func : public unary_func<
-                            typename make_vector_type<cl_float, N>::type,
-                            cl_half
-                          >
-{
-    typedef typename make_vector_type<cl_float, N>::type input1_type;
-    typedef typename make_vector_type<cl_half, N>::type result_type;
-    const static size_t vector_size = N;
-    const static size_t vec_alignment = N;
-
-    std::string str()
-    {
-        return "vstore_half";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_vector_load_store>\n";
-    }
-
-    result_type operator()(const input1_type& in)
-    {
-        result_type r;
-        for(size_t i = 0; i < N; i++)
-        {
-            r.s[i] = cl_half_from_float(in.s[i], CL_HALF_RTE);
-        }
-        return r;
-    }
-
-    input1_type min1()
-    {
-        return detail::make_value<input1_type>(-512.f);
-    }
-
-    input1_type max1()
-    {
-        return detail::make_value<input1_type>(512.f);
-    }
-
-    bool is_out_half()
-    {
-        return true;
-    }
-};
-
-template <cl_int N /* Vector size */>
-struct vstorea_half_func : public unary_func<
-                            typename make_vector_type<cl_float, N>::type,
-                            cl_half
-                          >
-{
-    typedef typename make_vector_type<cl_float, N>::type input1_type;
-    typedef typename make_vector_type<cl_half, N>::type result_type;
-    const static size_t vector_size = N;
-    const static size_t vec_alignment = N == 3 ? 4 : N;
-
-    std::string str()
-    {
-        return "vstorea_half";
-    }
-
-    std::string headers()
-    {
-        return "#include <opencl_vector_load_store>\n";
-    }
-
-    result_type operator()(const input1_type& in)
-    {
-        result_type r;
-        for(size_t i = 0; i < N; i++)
-        {
-            r.s[i] = cl_half_from_float(in.s[i], CL_HALF_RTE);
-        }
-        return r;
-    }
-
-    input1_type min1()
-    {
-        return detail::make_value<input1_type>(-512.f);
-    }
-
-    input1_type max1()
-    {
-        return detail::make_value<input1_type>(512.f);
-    }
-
-    bool is_out_half()
-    {
-        return true;
-    }
-};
-
-AUTO_TEST_CASE(test_vstore_funcs)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int last_error = CL_SUCCESS;
-
-#define TEST_VSTORE_FUNC_MACRO(CLASS) \
-    last_error = test_vstore_func( \
-        device, context, queue, n_elems, CLASS \
-    ); \
-    CHECK_ERROR(last_error) \
-    error |= last_error;
-
-    TEST_VSTORE_FUNC_MACRO((vstore_func<cl_uint, 2>()))
-    TEST_VSTORE_FUNC_MACRO((vstore_func<cl_uint, 3>()))
-    TEST_VSTORE_FUNC_MACRO((vstore_func<cl_int, 4>()))
-    TEST_VSTORE_FUNC_MACRO((vstore_func<cl_float, 8>()))
-    TEST_VSTORE_FUNC_MACRO((vstore_func<cl_uchar, 16>()))
-
-    TEST_VSTORE_FUNC_MACRO((vstore_half_func<2>()))
-    TEST_VSTORE_FUNC_MACRO((vstore_half_func<3>()))
-    TEST_VSTORE_FUNC_MACRO((vstore_half_func<4>()))
-    TEST_VSTORE_FUNC_MACRO((vstore_half_func<8>()))
-    TEST_VSTORE_FUNC_MACRO((vstore_half_func<16>()))
-
-    TEST_VSTORE_FUNC_MACRO((vstorea_half_func<2>()))
-    TEST_VSTORE_FUNC_MACRO((vstorea_half_func<3>()))
-
-#undef TEST_VSTORE_FUNC_MACRO
-
-    if(error != CL_SUCCESS)
-    {
-        return -1;
-    }
-    return error;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_VLOAD_VSTORE_FUNCS_VSTORE_FUNCS_HPP
diff --git a/test_conformance/clcpp/workgroups/CMakeLists.txt b/test_conformance/clcpp/workgroups/CMakeLists.txt
deleted file mode 100644
index 812e982e3f..0000000000
--- a/test_conformance/clcpp/workgroups/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_WORKGROUPS)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/workgroups/common.hpp b/test_conformance/clcpp/workgroups/common.hpp
deleted file mode 100644
index ab7b100d9f..0000000000
--- a/test_conformance/clcpp/workgroups/common.hpp
+++ /dev/null
@@ -1,97 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_WG_COMMON_HPP
-#define TEST_CONFORMANCE_CLCPP_WG_COMMON_HPP
-
-#include <string>
-#include <vector>
-#include <limits>
-
-enum class work_group_op : int {
-    add, min, max    
-};
-
-std::string to_string(work_group_op op)
-{
-    switch (op)
-    {
-        case work_group_op::add:
-            return "add";
-        case work_group_op::min:
-            return "min";
-        case work_group_op::max:
-            return "max";
-        default:
-            break;
-    }
-    return "";
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-std::vector<CL_INT_TYPE> generate_input(size_t count, size_t wg_size)
-{
-    std::vector<CL_INT_TYPE> input(count, CL_INT_TYPE(1));      
-    switch (op)
-    {
-        case work_group_op::add:
-            return input;
-        case work_group_op::min:
-            {                
-                size_t j = wg_size;
-                for(size_t i = 0; i < count; i++)
-                {                
-                    input[i] = static_cast<CL_INT_TYPE>(j);
-                    j--;
-                    if(j == 0)
-                    {
-                        j = wg_size;
-                    }
-                }
-            }
-            break;         
-        case work_group_op::max:          
-            {                
-                size_t j = 0;
-                for(size_t i = 0; i < count; i++)
-                {                
-                    input[i] = static_cast<CL_INT_TYPE>(j);
-                    j++;
-                    if(j == wg_size)
-                    {
-                        j = 0;
-                    }
-                }
-            }
-    }
-    return input;
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-std::vector<CL_INT_TYPE> generate_output(size_t count, size_t wg_size)
-{       
-    switch (op)
-    {
-        case work_group_op::add:
-            return std::vector<CL_INT_TYPE>(count, CL_INT_TYPE(0));
-        case work_group_op::min:  
-            return std::vector<CL_INT_TYPE>(count, (std::numeric_limits<CL_INT_TYPE>::max)());       
-        case work_group_op::max:          
-            return std::vector<CL_INT_TYPE>(count, (std::numeric_limits<CL_INT_TYPE>::min)());
-    }
-    return std::vector<CL_INT_TYPE>(count, CL_INT_TYPE(0));
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_WG_COMMON_HPP
diff --git a/test_conformance/clcpp/workgroups/main.cpp b/test_conformance/clcpp/workgroups/main.cpp
deleted file mode 100644
index 924bb44c3a..0000000000
--- a/test_conformance/clcpp/workgroups/main.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "test_wg_all.hpp"
-#include "test_wg_any.hpp"
-#include "test_wg_broadcast.hpp"
-#include "test_wg_reduce.hpp"
-#include "test_wg_scan_inclusive.hpp"
-#include "test_wg_scan_exclusive.hpp"
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/workgroups/test_wg_all.hpp b/test_conformance/clcpp/workgroups/test_wg_all.hpp
deleted file mode 100644
index 35ee521710..0000000000
--- a/test_conformance/clcpp/workgroups/test_wg_all.hpp
+++ /dev/null
@@ -1,220 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_WG_TEST_WG_ALL_HPP
-#define TEST_CONFORMANCE_CLCPP_WG_TEST_WG_ALL_HPP
-
-#include <vector>
-#include <limits>
-#include <algorithm>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-// Common for tests of work-group functions
-#include "common.hpp"
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-std::string generate_wg_all_kernel_code()
-{
-    return
-        "__kernel void test_wg_all(global uint *input, global uint *output)\n"
-        "{\n"
-        "    ulong tid = get_global_id(0);\n"
-        "\n"
-        "    int result = work_group_all(input[tid] < input[tid+1]);\n"
-        "    if(result == 0) {\n        output[tid] = 0;\n        return;\n    }\n"
-        "    output[tid] = 1;\n"
-        "}\n";
-}
-#else
-std::string generate_wg_all_kernel_code()
-{
-    return "#include <opencl_memory>\n"
-           "#include <opencl_work_item>\n"
-           "#include <opencl_work_group>\n"
-           "using namespace cl;\n"
-           "__kernel void test_wg_all(global_ptr<uint[]> input, global_ptr<uint[]> output)\n"
-           "{\n"
-           "    ulong tid = get_global_id(0);\n"
-           "    bool result = work_group_all(input[tid] < input[tid+1]);\n"
-           "    if(!result) {\n        output[tid] = 0;\n        return;\n    }\n"
-           "    output[tid] = 1;\n"
-           "}\n";
-}
-#endif
-
-int verify_wg_all(const std::vector<cl_uint> &in, const std::vector<cl_uint> &out, size_t count, size_t wg_size)
-{
-    size_t i, j;
-    for (i = 0; i < count; i += wg_size)
-    {
-        // Work-group all
-        bool all = true;
-        for (j = 0; j < ((count - i) > wg_size ? wg_size : (count - i)); j++)
-        {
-            if(!(in[i+j] < in[i+j+1]))
-            {
-                all = false;
-                break;
-            }
-        }
-
-        // Convert bool to uint
-        cl_uint all_uint = all ? 1 : 0;
-        // Check if all work-items in work-group stored correct value
-        for (j = 0; j < ((count - i) > wg_size ? wg_size : (count - i)); j++)
-        {
-            if (all_uint != out[i + j])
-            {
-                log_info(
-                    "work_group_all %s: Error at %lu: expected = %lu, got = %lu\n",
-                    type_name<cl_uint>().c_str(),
-                    i + j,
-                    static_cast<size_t>(all_uint),
-                    static_cast<size_t>(out[i + j]));
-                return -1;
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-std::vector<cl_uint> generate_input_wg_all(size_t count, size_t wg_size)
-{
-    std::vector<cl_uint> input(count, cl_uint(0));
-    size_t j = wg_size;
-    for(size_t i = 0; i < count; i++)
-    {
-        input[i] = static_cast<cl_uint>(i);
-        // In one place in ~half of workgroups input[tid] < input[tid+1] will
-        // generate false, that means for that workgroups work_group_all()
-        // should return false
-        if((j == wg_size/2) && (i > count/2))
-        {
-            input[i] = input[i - 1];
-        }
-        j--;
-        if(j == 0)
-        {
-            j = wg_size;
-        }
-    }
-    return input;
-}
-
-std::vector<cl_uint> generate_output_wg_all(size_t count, size_t wg_size)
-{
-    (void) wg_size;
-    return std::vector<cl_uint>(count, cl_uint(1));
-}
-
-int work_group_all(cl_device_id device, cl_context context, cl_command_queue queue, size_t count)
-{
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t wg_size;
-    size_t work_size[1];
-    int err;
-
-    std::string code_str = generate_wg_all_kernel_code();
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_all");
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_all", "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_all");
-    RETURN_ON_ERROR(err)
-#endif
-
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
-    RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")
-
-    // Calculate global work size
-    size_t flat_work_size;
-    size_t wg_number = static_cast<size_t>(
-        std::ceil(static_cast<double>(count) / wg_size)
-    );
-    flat_work_size = wg_number * wg_size;
-    work_size[0] = flat_work_size;
-
-    std::vector<cl_uint> input = generate_input_wg_all(flat_work_size + 1, wg_size);
-    std::vector<cl_uint> output = generate_output_wg_all(flat_work_size, wg_size);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(cl_uint) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, &wg_size, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (verify_wg_all(input, output, flat_work_size, wg_size) != CL_SUCCESS)
-    {
-        RETURN_ON_ERROR_MSG(-1, "work_group_all failed");
-    }
-    log_info("work_group_all passed\n");
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-AUTO_TEST_CASE(test_work_group_all)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int err = CL_SUCCESS;
-
-    err = work_group_all(device, context, queue, n_elems);
-    CHECK_ERROR(err)
-
-    if(err != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_WG_TEST_WG_ALL_HPP
diff --git a/test_conformance/clcpp/workgroups/test_wg_any.hpp b/test_conformance/clcpp/workgroups/test_wg_any.hpp
deleted file mode 100644
index 1ceb1ef685..0000000000
--- a/test_conformance/clcpp/workgroups/test_wg_any.hpp
+++ /dev/null
@@ -1,220 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_WG_TEST_WG_ANY_HPP
-#define TEST_CONFORMANCE_CLCPP_WG_TEST_WG_ANY_HPP
-
-#include <vector>
-#include <limits>
-#include <algorithm>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-// Common for tests of work-group functions
-#include "common.hpp"
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-std::string generate_wg_any_kernel_code()
-{
-    return
-        "__kernel void test_wg_any(global uint *input, global uint *output)\n"
-        "{\n"
-        "    ulong tid = get_global_id(0);\n"
-        "\n"
-        "    int result = work_group_any(input[tid] == input[tid+1]);\n"
-        "    if(result == 0) {\n        output[tid] = 0;\n        return;\n    }\n"
-        "    output[tid] = 1;\n"
-        "}\n";
-}
-#else
-std::string generate_wg_any_kernel_code()
-{
-    return "#include <opencl_memory>\n"
-           "#include <opencl_work_item>\n"
-           "#include <opencl_work_group>\n"
-           "using namespace cl;\n"
-           "__kernel void test_wg_any(global_ptr<uint[]> input, global_ptr<uint[]> output)\n"
-           "{\n"
-           "    ulong tid = get_global_id(0);\n"
-           "    bool result = work_group_any(input[tid] == input[tid+1]);\n"
-           "    if(!result) {\n        output[tid] = 0;\n        return;\n    }\n"
-           "    output[tid] = 1;\n"
-           "}\n";
-}
-#endif
-
-int verify_wg_any(const std::vector<cl_uint> &in, const std::vector<cl_uint> &out, size_t count, size_t wg_size)
-{
-    size_t i, j;
-    for (i = 0; i < count; i += wg_size)
-    {
-        // Work-group any
-        bool any = false;
-        for (j = 0; j < ((count - i) > wg_size ? wg_size : (count - i)); j++)
-        {
-            if(in[i+j] == in[i+j+1])
-            {
-                any = true;
-                break;
-            }
-        }
-
-        // Convert bool to uint
-        cl_uint any_uint = any ? 1 : 0;
-        // Check if all work-items in work-group stored correct value
-        for (j = 0; j < ((count - i) > wg_size ? wg_size : (count - i)); j++)
-        {
-            if (any_uint != out[i + j])
-            {
-                log_info(
-                    "work_group_any %s: Error at %lu: expected = %lu, got = %lu\n",
-                    type_name<cl_uint>().c_str(),
-                    i + j,
-                    static_cast<size_t>(any_uint),
-                    static_cast<size_t>(out[i + j]));
-                return -1;
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-std::vector<cl_uint> generate_input_wg_any(size_t count, size_t wg_size)
-{
-    std::vector<cl_uint> input(count, cl_uint(0));
-    size_t j = wg_size;
-    for(size_t i = 0; i < count; i++)
-    {
-        input[i] = static_cast<cl_uint>(i);
-        // In one place in ~half of workgroups input[tid] == input[tid+1] will
-        // generate true, that means for that workgroups work_group_any()
-        // should return true
-        if((j == wg_size/2) && (i > count/2))
-        {
-            input[i] = input[i - 1];
-        }
-        j--;
-        if(j == 0)
-        {
-            j = wg_size;
-        }
-    }
-    return input;
-}
-
-std::vector<cl_uint> generate_output_wg_any(size_t count, size_t wg_size)
-{
-    (void) wg_size;
-    return std::vector<cl_uint>(count, cl_uint(1));
-}
-
-int work_group_any(cl_device_id device, cl_context context, cl_command_queue queue, size_t count)
-{
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t wg_size;
-    size_t work_size[1];
-    int err;
-
-    std::string code_str = generate_wg_any_kernel_code();
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_any");
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_any", "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_any");
-    RETURN_ON_ERROR(err)
-#endif
-
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
-    RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")
-
-    // Calculate global work size
-    size_t flat_work_size;
-    size_t wg_number = static_cast<size_t>(
-        std::ceil(static_cast<double>(count) / wg_size)
-    );
-    flat_work_size = wg_number * wg_size;
-    work_size[0] = flat_work_size;
-
-    std::vector<cl_uint> input = generate_input_wg_any(flat_work_size + 1, wg_size);
-    std::vector<cl_uint> output = generate_output_wg_any(flat_work_size, wg_size);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(cl_uint) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, &wg_size, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (verify_wg_any(input, output, flat_work_size, wg_size) != CL_SUCCESS)
-    {
-        RETURN_ON_ERROR_MSG(-1, "work_group_any failed");
-    }
-    log_info("work_group_any passed\n");
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-AUTO_TEST_CASE(test_work_group_any)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int err = CL_SUCCESS;
-
-    err = work_group_any(device, context, queue, n_elems);
-    CHECK_ERROR(err)
-
-    if(err != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_WG_TEST_WG_ANY_HPP
diff --git a/test_conformance/clcpp/workgroups/test_wg_broadcast.hpp b/test_conformance/clcpp/workgroups/test_wg_broadcast.hpp
deleted file mode 100644
index 999aef192b..0000000000
--- a/test_conformance/clcpp/workgroups/test_wg_broadcast.hpp
+++ /dev/null
@@ -1,460 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_WG_TEST_WG_BROADCAST_HPP
-#define TEST_CONFORMANCE_CLCPP_WG_TEST_WG_BROADCAST_HPP
-
-#include <vector>
-#include <limits>
-#include <algorithm>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-// Common for tests of work-group functions
-#include "common.hpp"
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-std::string generate_wg_broadcast_1D_kernel_code()
-{
-    return
-        "__kernel void test_wg_broadcast(global uint *input, global uint *output)\n"
-        "{\n"
-        "    ulong tid = get_global_id(0);\n"
-        "    uint result = work_group_broadcast(input[tid], get_group_id(0) % get_local_size(0));\n"
-        "    output[tid] = result;\n"
-        "}\n";
-}
-std::string generate_wg_broadcast_2D_kernel_code()
-{
-    return
-        "__kernel void test_wg_broadcast(global uint *input, global uint *output)\n"
-        "{\n"
-        "    ulong tid_x = get_global_id(0);\n"
-        "    ulong tid_y = get_global_id(1);\n"
-        "    size_t x = get_group_id(0) % get_local_size(0);\n"
-        "    size_t y = get_group_id(1) % get_local_size(1);\n"
-        "    size_t idx = (tid_y * get_global_size(0)) + tid_x;\n"
-        "    uint result = work_group_broadcast(input[idx], x, y);\n"
-        "    output[idx] = result;\n"
-        "}\n";
-}
-std::string generate_wg_broadcast_3D_kernel_code()
-{
-    return
-        "__kernel void test_wg_broadcast(global uint *input, global uint *output)\n"
-        "{\n"
-        "    ulong tid_x = get_global_id(0);\n"
-        "    ulong tid_y = get_global_id(1);\n"
-        "    ulong tid_z = get_global_id(2);\n"
-        "    size_t x = get_group_id(0) % get_local_size(0);\n"
-        "    size_t y = get_group_id(1) % get_local_size(1);\n"
-        "    size_t z = get_group_id(2) % get_local_size(2);\n"
-        "    ulong idx = (tid_z * get_global_size(1) * get_global_size(0)) + (tid_y * get_global_size(0)) + tid_x;\n"
-        "    uint result = work_group_broadcast(input[idx], x, y, z);\n"
-        "    output[idx] = result;\n"
-        "}\n";
-}
-#else
-std::string generate_wg_broadcast_1D_kernel_code()
-{
-    return "#include <opencl_memory>\n"
-           "#include <opencl_work_item>\n"
-           "#include <opencl_work_group>\n"
-           "using namespace cl;\n"
-           "__kernel void test_wg_broadcast(global_ptr<uint[]> input, global_ptr<uint[]> output)\n"
-           "{\n"
-           "    ulong tid = get_global_id(0);\n"
-           "    uint result = work_group_broadcast(input[tid], get_group_id(0) % get_local_size(0));\n"
-           "    output[tid] = result;\n"
-           "}\n";
-}
-std::string generate_wg_broadcast_2D_kernel_code()
-{
-    return "#include <opencl_memory>\n"
-           "#include <opencl_work_item>\n"
-           "#include <opencl_work_group>\n"
-           "using namespace cl;\n"
-           "__kernel void test_wg_broadcast(global_ptr<uint[]> input, global_ptr<uint[]> output)\n"
-           "{\n"
-           "    ulong tid_x = get_global_id(0);\n"
-           "    ulong tid_y = get_global_id(1);\n"
-           "    size_t x = get_group_id(0) % get_local_size(0);\n"
-           "    size_t y = get_group_id(1) % get_local_size(1);\n"
-           "    size_t idx = (tid_y * get_global_size(0)) + tid_x;\n"
-           "    uint result = work_group_broadcast(input[idx], x, y);\n"
-           "    output[idx] = result;\n"
-           "}\n";
-}
-std::string generate_wg_broadcast_3D_kernel_code()
-{
-    return "#include <opencl_memory>\n"
-           "#include <opencl_work_item>\n"
-           "#include <opencl_work_group>\n"
-           "using namespace cl;\n"
-           "__kernel void test_wg_broadcast(global_ptr<uint[]> input, global_ptr<uint[]> output)\n"
-           "{\n"
-           "    ulong tid_x = get_global_id(0);\n"
-           "    ulong tid_y = get_global_id(1);\n"
-           "    ulong tid_z = get_global_id(2);\n"
-           "    size_t x = get_group_id(0) % get_local_size(0);\n"
-           "    size_t y = get_group_id(1) % get_local_size(1);\n"
-           "    size_t z = get_group_id(2) % get_local_size(2);\n"
-           "    ulong idx = (tid_z * get_global_size(1) * get_global_size(0)) + (tid_y * get_global_size(0)) + tid_x;\n"
-           "    uint result = work_group_broadcast(input[idx], x, y, z);\n"
-           "    output[idx] = result;\n"
-           "}\n";
-}
-#endif
-
-int
-verify_wg_broadcast_1D(const std::vector<cl_uint> &in, const std::vector<cl_uint> &out, size_t n, size_t wg_size)
-{
-    size_t i, j;
-    size_t group_id;
-
-    for (i=0,group_id=0; i<n; i+=wg_size,group_id++)
-    {
-        int local_size = (n-i) > wg_size ? wg_size : (n-i);
-        cl_uint broadcast_result = in[i + (group_id % local_size)];
-        for (j=0; j<local_size; j++)
-        {
-            if ( broadcast_result != out[i+j] )
-            {
-                log_info("work_group_broadcast: Error at %lu: expected = %u, got = %u\n", i+j, broadcast_result, out[i+j]);
-                return -1;
-            }
-        }
-    }
-
-    return CL_SUCCESS;
-}
-
-int
-verify_wg_broadcast_2D(const std::vector<cl_uint> &in, const std::vector<cl_uint> &out,
-                       size_t nx, size_t ny,
-                       size_t wg_size_x, size_t wg_size_y)
-{
-    size_t i, j, _i, _j;
-    size_t group_id_x, group_id_y;
-
-    for (i=0,group_id_y=0; i<ny; i+=wg_size_y,group_id_y++)
-    {
-        size_t y = group_id_y % wg_size_y;
-        size_t local_size_y = (ny-i) > wg_size_y ? wg_size_y : (ny-i);
-        for (_i=0; _i < local_size_y; _i++)
-        {
-            for (j=0,group_id_x=0; j<nx; j+=wg_size_x,group_id_x++)
-            {
-                size_t x = group_id_x % wg_size_x;
-                size_t local_size_x = (nx-j) > wg_size_x ? wg_size_x : (nx-j);
-                cl_uint broadcast_result = in[(i + y) * nx + (j + x)];
-                for (_j=0; _j < local_size_x; _j++)
-                {
-                    size_t indx = (i + _i) * nx + (j + _j);
-                    if ( broadcast_result != out[indx] )
-                    {
-                        log_info("%lu\n", indx);
-                        log_info("%lu\n", ((i + y) * nx + (j + x)));
-                         log_info("%lu\n", out.size());
-                        log_info("work_group_broadcast: Error at (%lu, %lu): expected = %u, got = %u\n", j+_j, i+_i, broadcast_result, out[indx]);
-                        return -1;
-                    }
-                }
-            }
-        }
-    }
-
-    return CL_SUCCESS;
-}
-
-int
-verify_wg_broadcast_3D(const std::vector<cl_uint> &in, const std::vector<cl_uint> &out,
-                       size_t nx, size_t ny, size_t nz,
-                       size_t wg_size_x, size_t wg_size_y, size_t wg_size_z)
-{
-    size_t i, j, k, _i, _j, _k;
-    size_t group_id_x, group_id_y, group_id_z;
-
-    for (i=0,group_id_z=0; i<nz; i+=wg_size_z,group_id_z++)
-    {
-        size_t z = group_id_z % wg_size_z;
-        size_t local_size_z = (nz-i) > wg_size_z ? wg_size_z : (nz-i);
-        for (_i=0; _i < local_size_z; _i++)
-        {
-            for (j=0,group_id_y=0; j<ny; j+=wg_size_y,group_id_y++)
-            {
-                size_t y = group_id_y % wg_size_y;
-                size_t local_size_y = (ny-j) > wg_size_y ? wg_size_y : (ny-j);
-                for (_j=0; _j < local_size_y; _j++)
-                {
-                    for (k=0,group_id_x=0; k<nx; k+=wg_size_x,group_id_x++)
-                    {
-                        size_t x = group_id_x % wg_size_x;
-                        size_t local_size_x = (nx-k) > wg_size_x ? wg_size_x : (nx-k);
-                        cl_uint broadcast_result = in[(i + z) * ny * nz + (j + y) * nx + (k + x)];
-                        for (_k=0; _k < local_size_x; _k++)
-                        {
-                            size_t indx = (i + _i) * ny * nx + (j + _j) * nx + (k + _k);
-                            if ( broadcast_result != out[indx] )
-                            {
-                                log_info(
-                                    "work_group_broadcast: Error at (%lu, %lu, %lu): expected = %u, got = %u\n",
-                                    k+_k, j+_j, i+_i,
-                                    broadcast_result, out[indx]);
-                                return -1;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-std::vector<cl_uint> generate_input_wg_broadcast(size_t count, size_t wg_size)
-{
-    std::vector<cl_uint> input(count, cl_uint(0));
-    size_t j = wg_size;
-    for(size_t i = 0; i < count; i++)
-    {
-        input[i] = static_cast<cl_uint>(j);
-        j--;
-        if(j == 0)
-        {
-            j = wg_size;
-        }
-    }
-    return input;
-}
-
-std::vector<cl_uint> generate_output_wg_broadcast(size_t count, size_t wg_size)
-{
-    (void) wg_size;
-    return std::vector<cl_uint>(count, cl_uint(1));
-}
-
-int work_group_broadcast(cl_device_id device, cl_context context, cl_command_queue queue, size_t count, size_t dim)
-{
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t flat_wg_size;
-    size_t wg_size[] = { 1, 1, 1};
-    size_t work_size[] = { 1, 1, 1};
-    int err;
-
-    // Get kernel source code
-    std::string code_str;
-    if(dim > 2) code_str = generate_wg_broadcast_3D_kernel_code();
-    else if(dim > 1) code_str = generate_wg_broadcast_2D_kernel_code();
-    else code_str = generate_wg_broadcast_1D_kernel_code();
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_broadcast");
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_broadcast", "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_broadcast");
-    RETURN_ON_ERROR(err)
-#endif
-
-    // Get max flat workgroup size
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &flat_wg_size, NULL);
-    RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")
-
-    // Set local work size
-    wg_size[0] = flat_wg_size;
-    if(dim > 2)
-    {
-        if (flat_wg_size >=512)
-        {
-            wg_size[0] = wg_size[1] = wg_size[2] = 8;
-        }
-        else if (flat_wg_size >= 64)
-        {
-            wg_size[0] = wg_size[1] = wg_size[2] = 4;
-        }
-        else if (flat_wg_size >= 8)
-        {
-            wg_size[0] = wg_size[1] = wg_size[2] = 2;
-        }
-        else
-        {
-            wg_size[0] = wg_size[1] = wg_size[2] = 1;
-        }
-    }
-    else if(dim > 1)
-    {
-        if (flat_wg_size >= 256)
-        {
-            wg_size[0] = wg_size[1] = 16;
-        }
-        else if (flat_wg_size >=64)
-        {
-            wg_size[0] = wg_size[1] = 8;
-        }
-        else if (flat_wg_size >= 16)
-        {
-            wg_size[0] = wg_size[1] = 4;
-        }
-        else
-        {
-            wg_size[0] = wg_size[1] = 1;
-        }
-    }
-
-    // Calculate flat local work size
-    flat_wg_size = wg_size[0];
-    if(dim > 1) flat_wg_size *= wg_size[1];
-    if(dim > 2) flat_wg_size *= wg_size[2];
-
-    // Calculate global work size
-    size_t flat_work_size = count;
-    // 3D
-    if(dim > 2)
-    {
-        size_t wg_number = static_cast<size_t>(
-            std::ceil(static_cast<double>(count / 3) / (wg_size[0] * wg_size[1] * wg_size[2]))
-        );
-        work_size[0] = wg_number * wg_size[0];
-        work_size[1] = wg_number * wg_size[1];
-        work_size[2] = wg_number * wg_size[2];
-        flat_work_size = work_size[0] * work_size[1] * work_size[2];
-    }
-    // 2D
-    else if(dim > 1)
-    {
-        size_t wg_number = static_cast<size_t>(
-            std::ceil(static_cast<double>(count / 2) / (wg_size[0] * wg_size[1]))
-        );
-        work_size[0] = wg_number * wg_size[0];
-        work_size[1] = wg_number * wg_size[1];
-        flat_work_size = work_size[0] * work_size[1];
-    }
-    // 1D
-    else
-    {
-        size_t wg_number = static_cast<size_t>(
-            std::ceil(static_cast<double>(count) / wg_size[0])
-        );
-        flat_work_size = wg_number * wg_size[0];
-        work_size[0] = flat_work_size;
-    }
-
-    std::vector<cl_uint> input = generate_input_wg_broadcast(flat_work_size, flat_wg_size);
-    std::vector<cl_uint> output = generate_output_wg_broadcast(flat_work_size, flat_wg_size);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(cl_uint) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(cl_uint) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    err = clEnqueueNDRangeKernel(queue, kernel, dim, NULL, work_size, wg_size, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(cl_uint) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    int result = CL_SUCCESS;
-    // 3D
-    if(dim > 2)
-    {
-        result = verify_wg_broadcast_3D(
-            input, output,
-            work_size[0], work_size[1], work_size[2],
-            wg_size[0], wg_size[1], wg_size[2]
-        );
-    }
-    // 2D
-    else if(dim > 1)
-    {
-        result = verify_wg_broadcast_2D(
-            input, output,
-            work_size[0], work_size[1],
-            wg_size[0], wg_size[1]
-        );
-    }
-    // 1D
-    else
-    {
-        result = verify_wg_broadcast_1D(
-            input, output,
-            work_size[0],
-            wg_size[0]
-        );
-    }
-
-    RETURN_ON_ERROR_MSG(result, "work_group_broadcast_%luD failed", dim);
-    log_info("work_group_broadcast_%luD passed\n", dim);
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-AUTO_TEST_CASE(test_work_group_broadcast)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = work_group_broadcast(device, context, queue, n_elems, 1);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_broadcast(device, context, queue, n_elems, 2);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_broadcast(device, context, queue, n_elems, 3);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_WG_TEST_WG_BROADCAST_HPP
diff --git a/test_conformance/clcpp/workgroups/test_wg_reduce.hpp b/test_conformance/clcpp/workgroups/test_wg_reduce.hpp
deleted file mode 100644
index 160b2e8655..0000000000
--- a/test_conformance/clcpp/workgroups/test_wg_reduce.hpp
+++ /dev/null
@@ -1,334 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_WG_TEST_WG_REDUCE_HPP
-#define TEST_CONFORMANCE_CLCPP_WG_TEST_WG_REDUCE_HPP
-
-#include <vector>
-#include <limits>
-#include <algorithm>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-// Common for tests of work-group functions
-#include "common.hpp"
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <class CL_INT_TYPE, work_group_op op>
-std::string generate_wg_reduce_kernel_code()
-{
-    return
-        "__kernel void test_wg_reduce(global " + type_name<CL_INT_TYPE>() + " *input, global " + type_name<CL_INT_TYPE>() + " *output)\n"
-        "{\n"
-        "    ulong tid = get_global_id(0);\n"
-        "\n"
-        "    " + type_name<CL_INT_TYPE>() + " result = work_group_reduce_" + to_string(op) + "(input[tid]);\n"
-        "    output[tid] = result;\n"
-        "}\n";
-}
-#else
-template <class CL_INT_TYPE, work_group_op op>
-std::string generate_wg_reduce_kernel_code()
-{
-    return "#include <opencl_memory>\n"
-           "#include <opencl_work_item>\n"
-           "#include <opencl_work_group>\n"
-           "using namespace cl;\n"
-           "__kernel void test_wg_reduce(global_ptr<" + type_name<CL_INT_TYPE>() + "[]> input, "
-                                        "global_ptr<" + type_name<CL_INT_TYPE>() + "[]> output)\n"
-           "{\n"
-           "    ulong tid = get_global_id(0);\n"
-           "    " + type_name<CL_INT_TYPE>() + " result = work_group_reduce<work_group_op::" + to_string(op) + ">(input[tid]);\n"
-           "    output[tid] = result;\n"
-           "}\n";
-}
-#endif
-
-template <class CL_INT_TYPE>
-int verify_wg_reduce_add(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size)
-{
-    size_t i, j;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        CL_INT_TYPE sum = 0;
-        // Work-group sum
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++)
-            sum += in[i + j];
-
-        // Check if all work-items in work-group stored correct value
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++)
-        {
-            if (sum != out[i + j])
-            {
-                log_info(
-                    "work_group_reduce_add %s: Error at %lu: expected = %lu, got = %lu\n",
-                    type_name<CL_INT_TYPE>().c_str(),
-                    i + j,
-                    static_cast<size_t>(sum),
-                    static_cast<size_t>(out[i + j]));
-                return -1;
-            }
-        }
-    }
-    return 0;
-}
-
-template <class CL_INT_TYPE>
-int verify_wg_reduce_min(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size)
-{
-    size_t i, j;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        CL_INT_TYPE min = (std::numeric_limits<CL_INT_TYPE>::max)();
-        // Work-group min
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++)
-            min = std::min<CL_INT_TYPE>(min, in[i + j]);
-
-        // Check if all work-items in work-group stored correct value
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++)
-        {
-            if (min != out[i + j])
-            {
-                log_info(
-                    "work_group_reduce_min %s: Error at %lu: expected = %lu, got = %lu\n",
-                    type_name<CL_INT_TYPE>().c_str(),
-                    i + j,
-                    static_cast<size_t>(min),
-                    static_cast<size_t>(out[i + j]));
-                return -1;
-            }
-        }
-    }
-    return 0;
-}
-
-template <class CL_INT_TYPE>
-int verify_wg_reduce_max(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size)
-{
-    size_t i, j;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        CL_INT_TYPE max = (std::numeric_limits<CL_INT_TYPE>::min)();
-        // Work-group max
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++)
-            max = std::max<CL_INT_TYPE>(max, in[i + j]);
-
-        // Check if all work-items in work-group stored correct value
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++)
-        {
-            if (max != out[i + j])
-            {
-                log_info(
-                    "work_group_reduce_max %s: Error at %lu: expected = %lu, got = %lu\n",
-                    type_name<CL_INT_TYPE>().c_str(),
-                    i + j,
-                    static_cast<size_t>(max),
-                    static_cast<size_t>(out[i + j]));
-                return -1;
-            }
-        }
-    }
-    return 0;
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-int verify_wg_reduce(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size)
-{
-    switch (op)
-    {
-        case work_group_op::add:
-            return verify_wg_reduce_add(in, out, wg_size);
-        case work_group_op::min:
-            return verify_wg_reduce_min(in, out, wg_size);
-        case work_group_op::max:
-            return verify_wg_reduce_max(in, out, wg_size);
-    }
-    return -1;
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-int work_group_reduce(cl_device_id device, cl_context context, cl_command_queue queue, size_t count)
-{
-    // don't run test for unsupported types
-    if(!type_supported<CL_INT_TYPE>(device))
-    {
-        return CL_SUCCESS;
-    }
-
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t wg_size;
-    size_t work_size[1];
-    int err;
-
-    std::string code_str = generate_wg_reduce_kernel_code<CL_INT_TYPE, op>();
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_reduce");
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_reduce", "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_reduce");
-    RETURN_ON_ERROR(err)
-#endif
-
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
-    RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")
-
-    // Calculate global work size
-    size_t flat_work_size;
-    size_t wg_number = static_cast<size_t>(
-        std::ceil(static_cast<double>(count) / wg_size)
-    );
-    flat_work_size = wg_number * wg_size;
-    work_size[0] = flat_work_size;
-
-    std::vector<CL_INT_TYPE> input = generate_input<CL_INT_TYPE, op>(flat_work_size, wg_size);
-    std::vector<CL_INT_TYPE> output = generate_output<CL_INT_TYPE, op>(flat_work_size, wg_size);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(CL_INT_TYPE) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    buffers[1] =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(CL_INT_TYPE) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(CL_INT_TYPE) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, &wg_size, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(CL_INT_TYPE) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (verify_wg_reduce<CL_INT_TYPE, op>(input, output, wg_size) != CL_SUCCESS)
-    {
-        RETURN_ON_ERROR_MSG(-1, "work_group_reduce_%s %s failed", to_string(op).c_str(), type_name<CL_INT_TYPE>().c_str());
-    }
-    log_info("work_group_reduce_%s %s passed\n", to_string(op).c_str(), type_name<CL_INT_TYPE>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-AUTO_TEST_CASE(test_work_group_reduce_add)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = work_group_reduce<cl_int, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_reduce<cl_uint, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_reduce<cl_long, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_reduce<cl_ulong, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-AUTO_TEST_CASE(test_work_group_reduce_min)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = work_group_reduce<cl_int, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_reduce<cl_uint, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_reduce<cl_long, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_reduce<cl_ulong, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-AUTO_TEST_CASE(test_work_group_reduce_max)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = work_group_reduce<cl_int, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_reduce<cl_uint, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_reduce<cl_long, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_reduce<cl_ulong, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_WG_TEST_WG_REDUCE_HPP
diff --git a/test_conformance/clcpp/workgroups/test_wg_scan_exclusive.hpp b/test_conformance/clcpp/workgroups/test_wg_scan_exclusive.hpp
deleted file mode 100644
index ef0e8ffc30..0000000000
--- a/test_conformance/clcpp/workgroups/test_wg_scan_exclusive.hpp
+++ /dev/null
@@ -1,327 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_WG_TEST_WG_SCAN_EXCLUSIVE_HPP
-#define TEST_CONFORMANCE_CLCPP_WG_TEST_WG_SCAN_EXCLUSIVE_HPP
-
-#include <vector>
-#include <algorithm>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-// Common for tests of work-group functions
-#include "common.hpp"
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <class CL_INT_TYPE, work_group_op op>
-std::string generate_wg_scan_exclusive_kernel_code()
-{
-    return
-        "__kernel void test_wg_scan_exclusive(global " + type_name<CL_INT_TYPE>() + " *input, global " + type_name<CL_INT_TYPE>() + " *output)\n"
-        "{\n"
-        "    ulong tid = get_global_id(0);\n"
-        "\n"
-        "    " + type_name<CL_INT_TYPE>() + " result = work_group_scan_exclusive_" + to_string(op) + "(input[tid]);\n"
-        "    output[tid] = result;\n"
-        "}\n";
-}
-#else
-template <class CL_INT_TYPE, work_group_op op>
-std::string generate_wg_scan_exclusive_kernel_code()
-{
-    return "#include <opencl_memory>\n"
-           "#include <opencl_work_item>\n"
-           "#include <opencl_work_group>\n"
-           "using namespace cl;\n"
-           "__kernel void test_wg_scan_exclusive(global_ptr<" + type_name<CL_INT_TYPE>() + "[]> input, "
-                                                "global_ptr<" + type_name<CL_INT_TYPE>() + "[]> output)\n"
-           "{\n"
-           "    ulong tid = get_global_id(0);\n"
-           "    " + type_name<CL_INT_TYPE>() + " result = work_group_scan_exclusive<work_group_op::" + to_string(op) + ">(input[tid]);\n"
-           "    output[tid] = result;\n"
-           "}\n";
-}
-#endif
-
-template <class CL_INT_TYPE>
-int verify_wg_scan_exclusive_add(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size)
-{
-    size_t i, j;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        CL_INT_TYPE sum = 0;
-
-        // Check if all work-items in work-group wrote correct value
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++)
-        {
-            if (sum != out[i + j])
-            {
-                log_info(
-                    "work_group_scan_exclusive_add %s: Error at %lu: expected = %lu, got = %lu\n",
-                    type_name<CL_INT_TYPE>().c_str(),
-                    i + j,
-                    static_cast<size_t>(sum),
-                    static_cast<size_t>(out[i + j]));
-                return -1;
-            }
-            sum += in[i + j];
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE>
-int verify_wg_scan_exclusive_min(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size)
-{
-    size_t i, j;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        CL_INT_TYPE min = (std::numeric_limits<CL_INT_TYPE>::max)();
-
-        // Check if all work-items in work-group wrote correct value
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++)
-        {
-            if (min != out[i + j])
-            {
-                log_info(
-                    "work_group_scan_exclusive_min %s: Error at %lu: expected = %lu, got = %lu\n",
-                    type_name<CL_INT_TYPE>().c_str(),
-                    i + j,
-                    static_cast<size_t>(min),
-                    static_cast<size_t>(out[i + j]));
-                return -1;
-            }
-            min = (std::min)(min, in[i + j]);
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE>
-int verify_wg_scan_exclusive_max(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size)
-{
-    size_t i, j;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        CL_INT_TYPE max = (std::numeric_limits<CL_INT_TYPE>::min)();
-
-        // Check if all work-items in work-group wrote correct value
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++)
-        {
-            if (max != out[i + j])
-            {
-                log_info(
-                    "work_group_scan_exclusive_max %s: Error at %lu: expected = %lu, got = %lu\n",
-                    type_name<CL_INT_TYPE>().c_str(),
-                    i + j,
-                    static_cast<size_t>(max),
-                    static_cast<size_t>(out[i + j]));
-                return -1;
-            }
-            max = (std::max)(max, in[i + j]);
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-int verify_wg_scan_exclusive(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size)
-{
-    switch (op)
-    {
-        case work_group_op::add:
-            return verify_wg_scan_exclusive_add(in, out, wg_size);
-        case work_group_op::min:
-            return verify_wg_scan_exclusive_min(in, out, wg_size);
-        case work_group_op::max:
-            return verify_wg_scan_exclusive_max(in, out, wg_size);
-    }
-    return -1;
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-int work_group_scan_exclusive(cl_device_id device, cl_context context, cl_command_queue queue, size_t count)
-{
-    // don't run test for unsupported types
-    if(!type_supported<CL_INT_TYPE>(device))
-    {
-        return CL_SUCCESS;
-    }
-
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t wg_size;
-    size_t work_size[1];
-    int err;
-
-    std::string code_str = generate_wg_scan_exclusive_kernel_code<CL_INT_TYPE, op>();
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_scan_exclusive");
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_scan_exclusive", "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_scan_exclusive");
-    RETURN_ON_ERROR(err)
-#endif
-
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
-    RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")
-
-    // Calculate global work size
-    size_t flat_work_size;
-    size_t wg_number = static_cast<size_t>(
-        std::ceil(static_cast<double>(count) / wg_size)
-    );
-    flat_work_size = wg_number * wg_size;
-    work_size[0] = flat_work_size;
-
-    std::vector<CL_INT_TYPE> input = generate_input<CL_INT_TYPE, op>(flat_work_size, wg_size);
-    std::vector<CL_INT_TYPE> output = generate_output<CL_INT_TYPE, op>(flat_work_size, wg_size);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(CL_INT_TYPE) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    buffers[1] =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(CL_INT_TYPE) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(CL_INT_TYPE) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, &wg_size, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(CL_INT_TYPE) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (verify_wg_scan_exclusive<CL_INT_TYPE, op>(input, output, wg_size) != CL_SUCCESS)
-    {
-        RETURN_ON_ERROR_MSG(-1, "work_group_scan_exclusive_%s %s failed", to_string(op).c_str(), type_name<CL_INT_TYPE>().c_str());
-    }
-    log_info("work_group_scan_exclusive_%s %s passed\n", to_string(op).c_str(), type_name<CL_INT_TYPE>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-AUTO_TEST_CASE(test_work_group_scan_exclusive_add)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = work_group_scan_exclusive<cl_int, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_exclusive<cl_uint, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_exclusive<cl_long, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_exclusive<cl_ulong, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-AUTO_TEST_CASE(test_work_group_scan_exclusive_min)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = work_group_scan_exclusive<cl_int, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_exclusive<cl_uint, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_exclusive<cl_long, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_exclusive<cl_ulong, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-AUTO_TEST_CASE(test_work_group_scan_exclusive_max)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = work_group_scan_exclusive<cl_int, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_exclusive<cl_uint, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_exclusive<cl_long, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_exclusive<cl_ulong, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_WG_TEST_WG_SCAN_EXCLUSIVE_HPP
diff --git a/test_conformance/clcpp/workgroups/test_wg_scan_inclusive.hpp b/test_conformance/clcpp/workgroups/test_wg_scan_inclusive.hpp
deleted file mode 100644
index 5623aed7f8..0000000000
--- a/test_conformance/clcpp/workgroups/test_wg_scan_inclusive.hpp
+++ /dev/null
@@ -1,327 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_WG_TEST_WG_SCAN_INCLUSIVE_HPP
-#define TEST_CONFORMANCE_CLCPP_WG_TEST_WG_SCAN_INCLUSIVE_HPP
-
-#include <vector>
-#include <algorithm>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-// Common for tests of work-group functions
-#include "common.hpp"
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-template <class CL_INT_TYPE, work_group_op op>
-std::string generate_wg_scan_inclusive_kernel_code()
-{
-    return
-        "__kernel void test_wg_scan_inclusive(global " + type_name<CL_INT_TYPE>() + " *input, global " + type_name<CL_INT_TYPE>() + " *output)\n"
-        "{\n"
-        "    ulong tid = get_global_id(0);\n"
-        "\n"
-        "    " + type_name<CL_INT_TYPE>() + " result = work_group_scan_inclusive_" + to_string(op) + "(input[tid]);\n"
-        "    output[tid] = result;\n"
-        "}\n";
-}
-#else
-template <class CL_INT_TYPE, work_group_op op>
-std::string generate_wg_scan_inclusive_kernel_code()
-{
-    return "#include <opencl_memory>\n"
-           "#include <opencl_work_item>\n"
-           "#include <opencl_work_group>\n"
-           "using namespace cl;\n"
-           "__kernel void test_wg_scan_inclusive(global_ptr<" + type_name<CL_INT_TYPE>() + "[]> input, "
-                                                "global_ptr<" + type_name<CL_INT_TYPE>() + "[]> output)\n"
-           "{\n"
-           "    ulong tid = get_global_id(0);\n"
-           "    " + type_name<CL_INT_TYPE>() + " result = work_group_scan_inclusive<work_group_op::" + to_string(op) + ">(input[tid]);\n"
-           "    output[tid] = result;\n"
-           "}\n";
-}
-#endif
-
-template <class CL_INT_TYPE>
-int verify_wg_scan_inclusive_add(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size)
-{
-    size_t i, j;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        CL_INT_TYPE sum = 0;
-
-        // Check if all work-items in work-group wrote correct value
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++)
-        {
-            sum += in[i + j];
-            if (sum != out[i + j])
-            {
-                log_info(
-                    "work_group_scan_inclusive_add %s: Error at %lu: expected = %lu, got = %lu\n",
-                    type_name<CL_INT_TYPE>().c_str(),
-                    i + j,
-                    static_cast<size_t>(sum),
-                    static_cast<size_t>(out[i + j]));
-                return -1;
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE>
-int verify_wg_scan_inclusive_min(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size)
-{
-    size_t i, j;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        CL_INT_TYPE min = (std::numeric_limits<CL_INT_TYPE>::max)();
-
-        // Check if all work-items in work-group wrote correct value
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++)
-        {
-            min = (std::min)(min, in[i + j]);
-            if (min != out[i + j])
-            {
-                log_info(
-                    "work_group_scan_inclusive_min %s: Error at %lu: expected = %lu, got = %lu\n",
-                    type_name<CL_INT_TYPE>().c_str(),
-                    i + j,
-                    static_cast<size_t>(min),
-                    static_cast<size_t>(out[i + j]));
-                return -1;
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE>
-int verify_wg_scan_inclusive_max(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size)
-{
-    size_t i, j;
-    for (i = 0; i < in.size(); i += wg_size)
-    {
-        CL_INT_TYPE max = (std::numeric_limits<CL_INT_TYPE>::min)();
-
-        // Check if all work-items in work-group wrote correct value
-        for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++)
-        {
-            max = (std::max)(max, in[i + j]);
-            if (max != out[i + j])
-            {
-                log_info(
-                    "work_group_scan_inclusive_max %s: Error at %lu: expected = %lu, got = %lu\n",
-                    type_name<CL_INT_TYPE>().c_str(),
-                    i + j,
-                    static_cast<size_t>(max),
-                    static_cast<size_t>(out[i + j]));
-                return -1;
-            }
-        }
-    }
-    return CL_SUCCESS;
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-int verify_wg_scan_inclusive(const std::vector<CL_INT_TYPE> &in, const std::vector<CL_INT_TYPE> &out, size_t wg_size)
-{
-    switch (op)
-    {
-        case work_group_op::add:
-            return verify_wg_scan_inclusive_add(in, out, wg_size);
-        case work_group_op::min:
-            return verify_wg_scan_inclusive_min(in, out, wg_size);
-        case work_group_op::max:
-            return verify_wg_scan_inclusive_max(in, out, wg_size);
-    }
-    return -1;
-}
-
-template <class CL_INT_TYPE, work_group_op op>
-int work_group_scan_inclusive(cl_device_id device, cl_context context, cl_command_queue queue, size_t count)
-{
-    // don't run test for unsupported types
-    if(!type_supported<CL_INT_TYPE>(device))
-    {
-        return CL_SUCCESS;
-    }
-
-    cl_mem buffers[2];
-    cl_program program;
-    cl_kernel kernel;
-    size_t wg_size;
-    size_t work_size[1];
-    int err;
-
-    std::string code_str = generate_wg_scan_inclusive_kernel_code<CL_INT_TYPE, op>();
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_scan_inclusive");
-    RETURN_ON_ERROR(err)
-    return err;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_scan_inclusive", "-cl-std=CL2.0", false);
-    RETURN_ON_ERROR(err)
-#else
-    err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_scan_inclusive");
-    RETURN_ON_ERROR(err)
-#endif
-
-    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
-    RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")
-
-    // Calculate global work size
-    size_t flat_work_size;
-    size_t wg_number = static_cast<size_t>(
-        std::ceil(static_cast<double>(count) / wg_size)
-    );
-    flat_work_size = wg_number * wg_size;
-    work_size[0] = flat_work_size;
-
-    std::vector<CL_INT_TYPE> input = generate_input<CL_INT_TYPE, op>(flat_work_size, wg_size);
-    std::vector<CL_INT_TYPE> output = generate_output<CL_INT_TYPE, op>(flat_work_size, wg_size);
-
-    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
-                                sizeof(CL_INT_TYPE) * input.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    buffers[1] =
-        clCreateBuffer(context, CL_MEM_READ_WRITE,
-                       sizeof(CL_INT_TYPE) * output.size(), NULL, &err);
-    RETURN_ON_CL_ERROR(err, "clCreateBuffer");
-
-    err = clEnqueueWriteBuffer(
-        queue, buffers[0], CL_TRUE, 0, sizeof(CL_INT_TYPE) * input.size(),
-        static_cast<void *>(input.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");
-
-    err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
-    err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
-    RETURN_ON_CL_ERROR(err, "clSetKernelArg");
-
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, &wg_size, 0, NULL, NULL);
-    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");
-
-    err = clEnqueueReadBuffer(
-        queue, buffers[1], CL_TRUE, 0, sizeof(CL_INT_TYPE) * output.size(),
-        static_cast<void *>(output.data()), 0, NULL, NULL
-    );
-    RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");
-
-    if (verify_wg_scan_inclusive<CL_INT_TYPE, op>(input, output, wg_size) != CL_SUCCESS)
-    {
-        RETURN_ON_ERROR_MSG(-1, "work_group_scan_inclusive_%s %s failed", to_string(op).c_str(), type_name<CL_INT_TYPE>().c_str());
-    }
-    log_info("work_group_scan_inclusive_%s %s passed\n", to_string(op).c_str(), type_name<CL_INT_TYPE>().c_str());
-
-    clReleaseMemObject(buffers[0]);
-    clReleaseMemObject(buffers[1]);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return err;
-}
-
-AUTO_TEST_CASE(test_work_group_scan_inclusive_add)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = work_group_scan_inclusive<cl_int, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_inclusive<cl_uint, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_inclusive<cl_long, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_inclusive<cl_ulong, work_group_op::add>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-AUTO_TEST_CASE(test_work_group_scan_inclusive_min)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = work_group_scan_inclusive<cl_int, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_inclusive<cl_uint, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_inclusive<cl_long, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_inclusive<cl_ulong, work_group_op::min>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-AUTO_TEST_CASE(test_work_group_scan_inclusive_max)
-(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
-{
-    int error = CL_SUCCESS;
-    int local_error = CL_SUCCESS;
-
-    local_error = work_group_scan_inclusive<cl_int, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_inclusive<cl_uint, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_inclusive<cl_long, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    local_error = work_group_scan_inclusive<cl_ulong, work_group_op::max>(device, context, queue, n_elems);
-    CHECK_ERROR(local_error)
-    error |= local_error;
-
-    if(error != CL_SUCCESS)
-        return -1;
-    return CL_SUCCESS;
-}
-
-#endif // TEST_CONFORMANCE_CLCPP_WG_TEST_WG_SCAN_INCLUSIVE_HPP
diff --git a/test_conformance/clcpp/workitems/CMakeLists.txt b/test_conformance/clcpp/workitems/CMakeLists.txt
deleted file mode 100644
index 00359334fb..0000000000
--- a/test_conformance/clcpp/workitems/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(MODULE_NAME CPP_WORKITEMS)
-
-set(${MODULE_NAME}_SOURCES
-    main.cpp
-)
-
-include(../../CMakeCommon.txt)
diff --git a/test_conformance/clcpp/workitems/main.cpp b/test_conformance/clcpp/workitems/main.cpp
deleted file mode 100644
index aacbdd4973..0000000000
--- a/test_conformance/clcpp/workitems/main.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "../common.hpp"
-
-#include "test_workitems.hpp"
-
-
-int main(int argc, const char *argv[])
-{
-    auto& tests = autotest::test_suite::global_test_suite().test_defs;
-    return runTestHarness(argc, argv, tests.size(), tests.data(), false, 0);
-}
diff --git a/test_conformance/clcpp/workitems/test_workitems.hpp b/test_conformance/clcpp/workitems/test_workitems.hpp
deleted file mode 100644
index 099ef34485..0000000000
--- a/test_conformance/clcpp/workitems/test_workitems.hpp
+++ /dev/null
@@ -1,417 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#ifndef TEST_CONFORMANCE_CLCPP_WI_TEST_WORKITEMS_HPP
-#define TEST_CONFORMANCE_CLCPP_WI_TEST_WORKITEMS_HPP
-
-#include <vector>
-#include <algorithm>
-#include <random>
-
-// Common for all OpenCL C++ tests
-#include "../common.hpp"
-
-
-namespace test_workitems {
-
-struct test_options
-{
-    bool uniform_work_group_size;
-    size_t max_count;
-    size_t num_tests;
-};
-
-struct output_type
-{
-    cl_uint  work_dim;
-    cl_ulong global_size[3];
-    cl_ulong global_id[3];
-    cl_ulong local_size[3];
-    cl_ulong enqueued_local_size[3];
-    cl_ulong local_id[3];
-    cl_ulong num_groups[3];
-    cl_ulong group_id[3];
-    cl_ulong global_offset[3];
-    cl_ulong global_linear_id;
-    cl_ulong local_linear_id;
-    cl_ulong sub_group_size;
-    cl_ulong max_sub_group_size;
-    cl_ulong num_sub_groups;
-    cl_ulong enqueued_num_sub_groups;
-    cl_ulong sub_group_id;
-    cl_ulong sub_group_local_id;
-};
-
-const std::string source_common = R"(
-struct output_type
-{
-    uint  work_dim;
-    ulong global_size[3];
-    ulong global_id[3];
-    ulong local_size[3];
-    ulong enqueued_local_size[3];
-    ulong local_id[3];
-    ulong num_groups[3];
-    ulong group_id[3];
-    ulong global_offset[3];
-    ulong global_linear_id;
-    ulong local_linear_id;
-    ulong sub_group_size;
-    ulong max_sub_group_size;
-    ulong num_sub_groups;
-    ulong enqueued_num_sub_groups;
-    ulong sub_group_id;
-    ulong sub_group_local_id;
-};
-)";
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-const std::string source =
-    source_common +
-    R"(
-        #ifdef cl_khr_subgroups
-        #pragma OPENCL EXTENSION cl_khr_subgroups : enable
-        #endif
-
-        kernel void test(global struct output_type *output)
-        {
-           const ulong gid = get_global_linear_id();
-           output[gid].work_dim = get_work_dim();
-           for (uint dimindx = 0; dimindx < 3; dimindx++)
-           {
-               output[gid].global_size[dimindx] = get_global_size(dimindx);
-               output[gid].global_id[dimindx] = get_global_id(dimindx);
-               output[gid].local_size[dimindx] = get_local_size(dimindx);
-               output[gid].enqueued_local_size[dimindx] = get_enqueued_local_size(dimindx);
-               output[gid].local_id[dimindx] = get_local_id(dimindx);
-               output[gid].num_groups[dimindx] = get_num_groups(dimindx);
-               output[gid].group_id[dimindx] = get_group_id(dimindx);
-               output[gid].global_offset[dimindx] = get_global_offset(dimindx);
-           }
-           output[gid].global_linear_id = get_global_linear_id();
-           output[gid].local_linear_id = get_local_linear_id();
-        #ifdef cl_khr_subgroups
-           output[gid].sub_group_size = get_sub_group_size();
-           output[gid].max_sub_group_size = get_max_sub_group_size();
-           output[gid].num_sub_groups = get_num_sub_groups();
-           output[gid].enqueued_num_sub_groups = get_enqueued_num_sub_groups();
-           output[gid].sub_group_id = get_sub_group_id();
-           output[gid].sub_group_local_id = get_sub_group_local_id();
-        #endif
-        }
-    )";
-#else
-const std::string source =
-    R"(
-        #include <opencl_memory>
-        #include <opencl_work_item>
-        using namespace cl;
-    )" +
-    source_common +
-    R"(
-
-        kernel void test(global_ptr<output_type[]> output)
-        {
-           const size_t gid = get_global_linear_id();
-           output[gid].work_dim = get_work_dim();
-           for (uint dimindx = 0; dimindx < 3; dimindx++)
-           {
-               output[gid].global_size[dimindx] = get_global_size(dimindx);
-               output[gid].global_id[dimindx] = get_global_id(dimindx);
-               output[gid].local_size[dimindx] = get_local_size(dimindx);
-               output[gid].enqueued_local_size[dimindx] = get_enqueued_local_size(dimindx);
-               output[gid].local_id[dimindx] = get_local_id(dimindx);
-               output[gid].num_groups[dimindx] = get_num_groups(dimindx);
-               output[gid].group_id[dimindx] = get_group_id(dimindx);
-               output[gid].global_offset[dimindx] = get_global_offset(dimindx);
-           }
-           output[gid].global_linear_id = get_global_linear_id();
-           output[gid].local_linear_id = get_local_linear_id();
-           output[gid].sub_group_size = get_sub_group_size();
-           output[gid].max_sub_group_size = get_max_sub_group_size();
-           output[gid].num_sub_groups = get_num_sub_groups();
-           output[gid].enqueued_num_sub_groups = get_enqueued_num_sub_groups();
-           output[gid].sub_group_id = get_sub_group_id();
-           output[gid].sub_group_local_id = get_sub_group_local_id();
-        }
-
-    )";
-#endif
-
-#define CHECK_EQUAL(result, expected, func_name) \
-    if (result != expected) \
-    { \
-        RETURN_ON_ERROR_MSG(-1, \
-            "Function %s failed. Expected: %s, got: %s", func_name, \
-            format_value(expected).c_str(), format_value(result).c_str() \
-        ); \
-    }
-
-#define CHECK(expression, func_name) \
-    if (expression) \
-    { \
-        RETURN_ON_ERROR_MSG(-1, \
-            "Function %s returned incorrect result", func_name \
-        ); \
-    }
-
-int test_workitems(cl_device_id device, cl_context context, cl_command_queue queue, test_options options)
-{
-    int error = CL_SUCCESS;
-
-    cl_program program;
-    cl_kernel kernel;
-
-    std::string kernel_name = "test";
-
-// -----------------------------------------------------------------------------------
-// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
-// -----------------------------------------------------------------------------------
-// Only OpenCL C++ to SPIR-V compilation
-#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name
-    );
-    RETURN_ON_ERROR(error)
-    return error;
-// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
-#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name, "-cl-std=CL2.0", false
-    );
-    RETURN_ON_ERROR(error)
-// Normal run
-#else
-    error = create_opencl_kernel(
-        context, &program, &kernel,
-        source, kernel_name
-    );
-    RETURN_ON_ERROR(error)
-#endif
-
-    size_t max_work_group_size;
-    size_t max_local_sizes[3];
-    error = get_max_allowed_work_group_size(context, kernel, &max_work_group_size, max_local_sizes);
-    RETURN_ON_ERROR(error)
-
-    bool check_sub_groups = true;
-    bool check_sub_groups_limits = true;
-#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
-    check_sub_groups = false;
-    check_sub_groups_limits = false;
-    if (is_extension_available(device, "cl_khr_subgroups"))
-    {
-        Version version = get_device_cl_version(device);
-        RETURN_ON_ERROR(error)
-        check_sub_groups_limits = (version >= Version(2,1)); // clGetKernelSubGroupInfo is from 2.1
-        check_sub_groups = true;
-    }
-#endif
-
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution<size_t> count_dis(1, options.max_count);
-
-    for (int test = 0; test < options.num_tests; test++)
-    {
-        for (size_t dim = 1; dim <= 3; dim++)
-        {
-            size_t global_size[3] = { 1, 1, 1 };
-            size_t global_offset[3] = { 0, 0, 0 };
-            size_t enqueued_local_size[3] = { 1, 1, 1 };
-            size_t count = count_dis(gen);
-            std::uniform_int_distribution<size_t> global_size_dis(1, static_cast<size_t>(pow(count, 1.0 / dim)));
-            for (int d = 0; d < dim; d++)
-            {
-                std::uniform_int_distribution<size_t> enqueued_local_size_dis(1, max_local_sizes[d]);
-                global_size[d] = global_size_dis(gen);
-                global_offset[d] = global_size_dis(gen);
-                enqueued_local_size[d] = enqueued_local_size_dis(gen);
-            }
-            // Local work size must not exceed CL_KERNEL_WORK_GROUP_SIZE for this kernel
-            while (enqueued_local_size[0] * enqueued_local_size[1] * enqueued_local_size[2] > max_work_group_size)
-            {
-                // otherwise decrease it until it fits
-                for (int d = 0; d < dim; d++)
-                {
-                    enqueued_local_size[d] = (std::max)((size_t)1, enqueued_local_size[d] / 2);
-                }
-            }
-            if (options.uniform_work_group_size)
-            {
-                for (int d = 0; d < dim; d++)
-                {
-                    global_size[d] = get_uniform_global_size(global_size[d], enqueued_local_size[d]);
-                }
-            }
-            count = global_size[0] * global_size[1] * global_size[2];
-
-            cl_mem output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(output_type) * count, NULL, &error);
-            RETURN_ON_CL_ERROR(error, "clCreateBuffer")
-
-            const char pattern = 0;
-            error = clEnqueueFillBuffer(queue, output_buffer, &pattern, sizeof(pattern), 0, sizeof(output_type) * count, 0, NULL, NULL);
-            RETURN_ON_CL_ERROR(error, "clEnqueueFillBuffer")
-
-            error = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
-            RETURN_ON_CL_ERROR(error, "clSetKernelArg")
-
-            error = clEnqueueNDRangeKernel(queue, kernel, dim, global_offset, global_size, enqueued_local_size, 0, NULL, NULL);
-            RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")
-
-            std::vector<output_type> output(count);
-            error = clEnqueueReadBuffer(
-                queue, output_buffer, CL_TRUE,
-                0, sizeof(output_type) * count,
-                static_cast<void *>(output.data()),
-                0, NULL, NULL
-            );
-            RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")
-
-            error = clReleaseMemObject(output_buffer);
-            RETURN_ON_CL_ERROR(error, "clReleaseMemObject")
-
-            size_t sub_group_count_for_ndrange = 0;
-            size_t max_sub_group_size_for_ndrange = 0;
-            if (check_sub_groups_limits)
-            {
-                error = clGetKernelSubGroupInfo(kernel, device, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE,
-                    sizeof(size_t) * dim, enqueued_local_size,
-                    sizeof(size_t), &sub_group_count_for_ndrange, NULL);
-                RETURN_ON_CL_ERROR(error, "clGetKernelSubGroupInfo")
-
-                error = clGetKernelSubGroupInfo(kernel, device, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
-                    sizeof(size_t) * dim, enqueued_local_size,
-                    sizeof(size_t), &max_sub_group_size_for_ndrange, NULL);
-                RETURN_ON_CL_ERROR(error, "clGetKernelSubGroupInfo")
-            }
-
-            size_t num_groups[3];
-            for (int d = 0; d < 3; d++)
-                num_groups[d] = static_cast<size_t>(std::ceil(static_cast<double>(global_size[d]) / enqueued_local_size[d]));
-
-            size_t group_id[3];
-            for (group_id[0] = 0; group_id[0] < num_groups[0]; group_id[0]++)
-            for (group_id[1] = 0; group_id[1] < num_groups[1]; group_id[1]++)
-            for (group_id[2] = 0; group_id[2] < num_groups[2]; group_id[2]++)
-            {
-                size_t local_size[3];
-                for (int d = 0; d < 3; d++)
-                {
-                    if (group_id[d] == num_groups[d] - 1)
-                        local_size[d] = global_size[d] - group_id[d] * enqueued_local_size[d];
-                    else
-                        local_size[d] = enqueued_local_size[d];
-                }
-
-                size_t local_id[3];
-                for (local_id[0] = 0; local_id[0] < local_size[0]; local_id[0]++)
-                for (local_id[1] = 0; local_id[1] < local_size[1]; local_id[1]++)
-                for (local_id[2] = 0; local_id[2] < local_size[2]; local_id[2]++)
-                {
-                    size_t global_id_wo_offset[3];
-                    size_t global_id[3];
-                    for (int d = 0; d < 3; d++)
-                    {
-                        global_id_wo_offset[d] = group_id[d] * enqueued_local_size[d] + local_id[d];
-                        global_id[d] = global_id_wo_offset[d] + global_offset[d];
-                    }
-
-                    // Ignore if the current work-item is outside of global work size (i.e. the work-group is non-uniform)
-                    if (global_id_wo_offset[0] >= global_size[0] ||
-                        global_id_wo_offset[1] >= global_size[1] ||
-                        global_id_wo_offset[2] >= global_size[2]) break;
-
-                    const size_t global_linear_id =
-                        global_id_wo_offset[2] * global_size[1] * global_size[0] +
-                        global_id_wo_offset[1] * global_size[0] +
-                        global_id_wo_offset[0];
-                    const size_t local_linear_id =
-                        local_id[2] * local_size[1] * local_size[0] +
-                        local_id[1] * local_size[0] +
-                        local_id[0];
-
-                    const output_type &o = output[global_linear_id];
-
-                    CHECK_EQUAL(o.work_dim, dim, "get_work_dim")
-                    for (int d = 0; d < 3; d++)
-                    {
-                        CHECK_EQUAL(o.global_size[d], global_size[d], "get_global_size")
-                        CHECK_EQUAL(o.global_id[d], global_id[d], "get_global_id")
-                        CHECK_EQUAL(o.local_size[d], local_size[d], "get_local_size")
-                        CHECK_EQUAL(o.enqueued_local_size[d], enqueued_local_size[d], "get_enqueued_local_size")
-                        CHECK_EQUAL(o.local_id[d], local_id[d], "get_local_id")
-                        CHECK_EQUAL(o.num_groups[d], num_groups[d], "get_num_groups")
-                        CHECK_EQUAL(o.group_id[d], group_id[d], "get_group_id")
-                        CHECK_EQUAL(o.global_offset[d], global_offset[d], "get_global_offset")
-                    }
-
-                    CHECK_EQUAL(o.global_linear_id, global_linear_id, "get_global_linear_id")
-                    CHECK_EQUAL(o.local_linear_id, local_linear_id, "get_local_linear_id")
-
-                    // A few (but not all possible) sub-groups related checks
-                    if (check_sub_groups)
-                    {
-                        if (check_sub_groups_limits)
-                        {
-                            CHECK_EQUAL(o.max_sub_group_size, max_sub_group_size_for_ndrange, "get_max_sub_group_size")
-                            CHECK_EQUAL(o.enqueued_num_sub_groups, sub_group_count_for_ndrange, "get_enqueued_num_sub_groups")
-                        }
-                        CHECK(o.sub_group_size == 0 || o.sub_group_size > o.max_sub_group_size, "get_sub_group_size or get_max_sub_group_size")
-                        CHECK(o.num_sub_groups == 0 || o.num_sub_groups > o.enqueued_num_sub_groups, "get_enqueued_num_sub_groups")
-                        CHECK(o.sub_group_id >= o.num_sub_groups, "get_sub_group_id or get_num_sub_groups")
-                        CHECK(o.sub_group_local_id >= o.sub_group_size, "get_sub_group_local_id or get_sub_group_size")
-                    }
-                }
-            }
-        }
-    }
-
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    return error;
-}
-
-#undef CHECK_EQUAL
-#undef CHECK
-
-AUTO_TEST_CASE(test_workitems_uniform)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    test_options options;
-    options.uniform_work_group_size = true;
-    options.max_count = num_elements;
-    options.num_tests = 1000;
-    return test_workitems(device, context, queue, options);
-}
-
-AUTO_TEST_CASE(test_workitems_non_uniform)
-(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
-{
-    test_options options;
-    options.uniform_work_group_size = false;
-    options.max_count = num_elements;
-    options.num_tests = 1000;
-    return test_workitems(device, context, queue, options);
-}
-
-} // namespace
-
-#endif // TEST_CONFORMANCE_CLCPP_WI_TEST_WORKITEMS_HPP
diff --git a/test_conformance/extensions/cl_ext_cxx_for_opencl/cxx_for_opencl_ext.cpp b/test_conformance/extensions/cl_ext_cxx_for_opencl/cxx_for_opencl_ext.cpp
index 4b03b54ae1..1d5252cb24 100644
--- a/test_conformance/extensions/cl_ext_cxx_for_opencl/cxx_for_opencl_ext.cpp
+++ b/test_conformance/extensions/cl_ext_cxx_for_opencl/cxx_for_opencl_ext.cpp
@@ -42,8 +42,7 @@ int test_cxx_for_opencl(cl_device_id device, cl_context context,
         })";
 
     error = create_single_kernel_helper_with_build_options(
-        context, &program, &kernel1, 1, &kernel_sstr, "k1", "-cl-std=CLC++",
-        false);
+        context, &program, &kernel1, 1, &kernel_sstr, "k1", "-cl-std=CLC++");
     test_error(error, "Failed to create k1 kernel");
 
     kernel2 = clCreateKernel(program, "k2", &error);

From a43d96de6932520e8e67408f0869e864e0a7e9a1 Mon Sep 17 00:00:00 2001
From: Chetan Mistry <70694498+chemis01@users.noreply.github.com>
Date: Thu, 13 May 2021 09:18:12 +0100
Subject: [PATCH 087/158] Redesign clGetKernelArgInfo (#522) (#1056)

* Improve Functionality of Harness

In the harness we previously were able to determine whether or
not a device supports the half or double data types, but doing so
required unintuitive function calls and would need to be repeated
per test.
A new pair of functions has been added which clearly state
what they do and make it easier to determine whether or not
a device supports the types.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>

* Remove Old GetKernelArgInfo Tests (#522)

In the API test suite we have 2 versions which test the
clGetKernelArgInfo API. As part of this ticket we are redesigning
the implementation of this test. This change removes all of
the old code and makes it so that the tests simply pass. A later
commit will add the redesigned test.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>

* Redesign GetKernelArgInfo (#522)

The previous test for this API consisted of 5K+ lines
of code which would define the test kernels and the
expected outputs from this API. This redesign
instead generates the kernels and expected outputs
leading to increased maintainability and a significantly
reduced line-of-code count.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>

* [SQUASH] Address Review Comments

This commit does the following:
    1) Update the Copyright to 2021
    2) Fixes a typo in a comment
    3) Explicitly declares a vector variable
       (previously auto)
    4) Output subtest result after completion rather than
       all of them at the end

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>

* [SQUASH] Ensure Kernel Arguments do not exceed CL_DEVICE_MAX_PARAMETER_SIZE

As per upstream comments, this change ensures that the total
size of parameters passed into a kernel does not exceed the
limit specified by CL_DEVICE_MAX_PARAMETER_SIZE for the device
used.
Additionally this change replaces ASSERT_SUCCESS() with test_error()
as per upstream requests.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>

* [SQUASH] Address Image and Vector Failures

This change aligns vector 3 types to be sized 4.
Additionally it ensures that image arguments do not
have the address space qualifier specified because
they are by default in the __global space.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>

* [SQUASH] Ensure that the size of pipe arguments are correct

As mentioned in PR comments, the test previously assumed that
sizeof(char) == sizeof(pipe char). The Clang implementation
treats a pipe as having the same size as a pointer, which
is now reflected in the code.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>

* [SQUASH] Ensure that CL_DEVICE_MAX_PIPE_ARGS is not Exceeded

This commit refactors the code so that Pipes are handled
separately.
Additionally, it removes signed char and char signed as
scalar types to test and removes some redundant code
for modifying the expected type when processing unsigned
scalar types.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>

* [SQUASH] Remove compatibility test from skip-list

There is a list of tests which should be skipped when
using an offline compiler. As get_kernel_arg_compatibility
has been removed, it should also be removed here.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>

* [SQUASH] Disable Pipe Tests

This change disables the Pipe tests for clGetKernelArgInfo
as pipe metadata is not accurately reported on clang
which leads to the pipe tests failing.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>
---
 test_common/harness/deviceInfo.cpp            |   12 +
 test_common/harness/deviceInfo.h              |    4 +
 test_common/harness/errorHelpers.cpp          |    1 -
 test_common/harness/kernelHelpers.cpp         |   23 +
 test_common/harness/kernelHelpers.h           |    8 +-
 test_conformance/api/CMakeLists.txt           |    1 -
 test_conformance/api/main.cpp                 |    1 -
 test_conformance/api/procs.h                  |    1 -
 test_conformance/api/test_kernel_arg_info.cpp | 6731 +++--------------
 .../test_kernel_arg_info_compatibility.cpp    | 5159 -------------
 10 files changed, 942 insertions(+), 10999 deletions(-)
 delete mode 100644 test_conformance/api/test_kernel_arg_info_compatibility.cpp

diff --git a/test_common/harness/deviceInfo.cpp b/test_common/harness/deviceInfo.cpp
index 12611873d9..287a142303 100644
--- a/test_common/harness/deviceInfo.cpp
+++ b/test_common/harness/deviceInfo.cpp
@@ -86,3 +86,15 @@ std::string get_device_name(cl_device_id device)
 {
     return get_device_info_string(device, CL_DEVICE_NAME);
 }
+
+size_t get_max_param_size(cl_device_id device)
+{
+    size_t ret(0);
+    if (clGetDeviceInfo(device, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof(ret), &ret,
+                        nullptr)
+        != CL_SUCCESS)
+    {
+        throw std::runtime_error("clGetDeviceInfo failed\n");
+    }
+    return ret;
+}
diff --git a/test_common/harness/deviceInfo.h b/test_common/harness/deviceInfo.h
index af923a2fa4..f8c55805cd 100644
--- a/test_common/harness/deviceInfo.h
+++ b/test_common/harness/deviceInfo.h
@@ -42,4 +42,8 @@ std::string get_device_version_string(cl_device_id device);
 
 /* Returns a string containing the device name. */
 std::string get_device_name(cl_device_id device);
+
+// Returns the maximum size in bytes for Kernel Parameters
+size_t get_max_param_size(cl_device_id device);
+
 #endif // _deviceInfo_h
diff --git a/test_common/harness/errorHelpers.cpp b/test_common/harness/errorHelpers.cpp
index da1660f1e3..22a2677d04 100644
--- a/test_common/harness/errorHelpers.cpp
+++ b/test_common/harness/errorHelpers.cpp
@@ -623,7 +623,6 @@ cl_int OutputBuildLogs(cl_program program, cl_uint num_devices,
 
 const char *subtests_to_skip_with_offline_compiler[] = {
     "get_kernel_arg_info",
-    "get_kernel_arg_info_compatibility",
     "binary_create",
     "load_program_source",
     "load_multistring_source",
diff --git a/test_common/harness/kernelHelpers.cpp b/test_common/harness/kernelHelpers.cpp
index f2d2909dba..95b9555ed3 100644
--- a/test_common/harness/kernelHelpers.cpp
+++ b/test_common/harness/kernelHelpers.cpp
@@ -1756,3 +1756,26 @@ bool poll_until(unsigned timeout_ms, unsigned interval_ms,
 
     return ret;
 }
+
+bool device_supports_double(cl_device_id device)
+{
+    if (is_extension_available(device, "cl_khr_fp64"))
+    {
+        return true;
+    }
+    else
+    {
+        cl_device_fp_config double_fp_config;
+        cl_int err = clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG,
+                                     sizeof(double_fp_config),
+                                     &double_fp_config, nullptr);
+        test_error(err,
+                   "clGetDeviceInfo for CL_DEVICE_DOUBLE_FP_CONFIG failed");
+        return double_fp_config != 0;
+    }
+}
+
+bool device_supports_half(cl_device_id device)
+{
+    return is_extension_available(device, "cl_khr_fp16");
+}
diff --git a/test_common/harness/kernelHelpers.h b/test_common/harness/kernelHelpers.h
index f971a8f2b0..4d8f2a8fa7 100644
--- a/test_common/harness/kernelHelpers.h
+++ b/test_common/harness/kernelHelpers.h
@@ -178,7 +178,7 @@ cl_device_fp_config get_default_rounding_mode(cl_device_id device);
     }
 
 #define PASSIVE_REQUIRE_FP16_SUPPORT(device)                                   \
-    if (!is_extension_available(device, "cl_khr_fp16"))                        \
+    if (!device_supports_half(device))                                         \
     {                                                                          \
         log_info(                                                              \
             "\n\tNote: device does not support fp16. Skipping test...\n");     \
@@ -208,4 +208,10 @@ bool device_supports_cl_c_version(cl_device_id device, Version version);
 bool poll_until(unsigned timeout_ms, unsigned interval_ms,
                 std::function<bool()> fn);
 
+// Checks whether the device supports double data types
+bool device_supports_double(cl_device_id device);
+
+// Checks whether the device supports half data types
+bool device_supports_half(cl_device_id device);
+
 #endif // _kernelHelpers_h
diff --git a/test_conformance/api/CMakeLists.txt b/test_conformance/api/CMakeLists.txt
index 20cb9b82dc..375e92d3d8 100644
--- a/test_conformance/api/CMakeLists.txt
+++ b/test_conformance/api/CMakeLists.txt
@@ -21,7 +21,6 @@ set(${MODULE_NAME}_SOURCES
          test_device_min_data_type_align_size_alignment.cpp
          test_platform.cpp
          test_kernel_arg_info.cpp
-         test_kernel_arg_info_compatibility.cpp
          test_null_buffer_arg.cpp
          test_mem_object_info.cpp
          test_min_image_formats.cpp
diff --git a/test_conformance/api/main.cpp b/test_conformance/api/main.cpp
index 16ca81c4c5..ef9f00cf2b 100644
--- a/test_conformance/api/main.cpp
+++ b/test_conformance/api/main.cpp
@@ -51,7 +51,6 @@ test_definition test_list[] = {
     ADD_TEST(load_two_kernels_manually),
     ADD_TEST(get_program_info_kernel_names),
     ADD_TEST(get_kernel_arg_info),
-    ADD_TEST(get_kernel_arg_info_compatibility),
     ADD_TEST(create_kernels_in_program),
     ADD_TEST(get_kernel_info),
     ADD_TEST(kernel_private_memory_size),
diff --git a/test_conformance/api/procs.h b/test_conformance/api/procs.h
index af373e43c3..e9c45c360d 100644
--- a/test_conformance/api/procs.h
+++ b/test_conformance/api/procs.h
@@ -119,7 +119,6 @@ extern int      test_get_image1d_info( cl_device_id deviceID, cl_context context
 extern int      test_get_image1d_array_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements );
 extern int      test_get_image2d_array_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements );
 extern int      test_get_kernel_arg_info( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
-extern int      test_get_kernel_arg_info_compatibility( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
 extern int      test_queue_hint(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_sub_group_dispatch(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_clone_kernel(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
diff --git a/test_conformance/api/test_kernel_arg_info.cpp b/test_conformance/api/test_kernel_arg_info.cpp
index f1039aeccb..8073e0defe 100644
--- a/test_conformance/api/test_kernel_arg_info.cpp
+++ b/test_conformance/api/test_kernel_arg_info.cpp
@@ -1,5943 +1,1004 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "testBase.h"
-#include <limits.h>
-#include <ctype.h>
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#define ARG_INFO_FIELD_COUNT        5
-
-#define ARG_INFO_ADDR_OFFSET        1
-#define ARG_INFO_ACCESS_OFFSET        2
-#define ARG_INFO_TYPE_QUAL_OFFSET    3
-#define ARG_INFO_TYPE_NAME_OFFSET    4
-#define ARG_INFO_ARG_NAME_OFFSET    5
-
-
-typedef char const * kernel_args_t[];
-
-kernel_args_t required_kernel_args = {
-    "typedef float4 typedef_type;\n"
-    "\n"
-    "typedef struct struct_type {\n"
-    "    float4 float4d;\n"
-    "    int intd;\n"
-    "} typedef_struct_type;\n"
-    "\n"
-    "typedef union union_type {\n"
-    "    float4 float4d;\n"
-    "    uint4 uint4d;\n"
-    "} typedef_union_type;\n"
-    "\n"
-    "typedef enum enum_type {\n"
-    "    enum_type_zero,\n"
-    "    enum_type_one,\n"
-    "    enum_type_two\n"
-    "} typedef_enum_type;\n"
-    "\n"
-    "kernel void constant_scalar_p0(constant void*constantvoidp,\n"
-    "                              constant char *constantcharp,\n"
-    "                              constant uchar* constantucharp,\n"
-    "                              constant unsigned char * constantunsignedcharp)\n"
-  "{}\n",
-    "kernel void constant_scalar_p1(constant short*constantshortp,\n"
-    "                              constant ushort *constantushortp,\n"
-    "                              constant unsigned short* constantunsignedshortp,\n"
-    "                              constant int * constantintp)\n"
-  "{}\n",
-    "kernel void constant_scalar_p2(constant uint*constantuintp,\n"
-    "                              constant unsigned int *constantunsignedintp)\n"
-  "{}\n",
-    "kernel void constant_scalar_p3(constant float *constantfloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_scalar_restrict_p0(constant void* restrict constantvoidrestrictp,\n"
-    "                                       constant char * restrict constantcharrestrictp,\n"
-    "                                       constant uchar*restrict constantucharrestrictp,\n"
-    "                                       constant unsigned char *restrict constantunsignedcharrestrictp)\n"
-    "{}\n",
-    "kernel void constant_scalar_restrict_p1(constant short* restrict constantshortrestrictp,\n"
-    "                                       constant ushort * restrict constantushortrestrictp,\n"
-    "                                       constant unsigned short*restrict constantunsignedshortrestrictp,\n"
-    "                                       constant int *restrict constantintrestrictp)\n"
-    "{}\n",
-    "kernel void constant_scalar_restrict_p2(constant uint* restrict constantuintrestrictp,\n"
-    "                                       constant unsigned int * restrict constantunsignedintrestrictp)\n"
-    "{}\n",
-    "kernel void constant_scalar_restrict_p3(constant float * restrict constantfloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_scalar_p(global void*globalvoidp,\n"
-    "                            global char *globalcharp,\n"
-    "                            global uchar* globalucharp,\n"
-    "                            global unsigned char * globalunsignedcharp,\n"
-    "                            global short*globalshortp,\n"
-    "                            global ushort *globalushortp,\n"
-    "                            global unsigned short* globalunsignedshortp,\n"
-    "                            global int * globalintp,\n"
-    "                            global uint*globaluintp,\n"
-    "                            global unsigned int *globalunsignedintp,\n"
-    "                            global float *globalfloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_scalar_restrict_p(global void* restrict globalvoidrestrictp,\n"
-    "                                     global char * restrict globalcharrestrictp,\n"
-    "                                     global uchar*restrict globalucharrestrictp,\n"
-    "                                     global unsigned char *restrict globalunsignedcharrestrictp,\n"
-    "                                     global short* restrict globalshortrestrictp,\n"
-    "                                     global ushort * restrict globalushortrestrictp,\n"
-    "                                     global unsigned short*restrict globalunsignedshortrestrictp,\n"
-    "                                     global int *restrict globalintrestrictp,\n"
-    "                                     global uint* restrict globaluintrestrictp,\n"
-    "                                     global unsigned int * restrict globalunsignedintrestrictp,\n"
-    "                                     global float * restrict globalfloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_scalar_p(global const void*globalconstvoidp,\n"
-    "                                  global const char *globalconstcharp,\n"
-    "                                  global const uchar* globalconstucharp,\n"
-    "                                  global const unsigned char * globalconstunsignedcharp,\n"
-    "                                  global const short*globalconstshortp,\n"
-    "                                  global const ushort *globalconstushortp,\n"
-    "                                  global const unsigned short* globalconstunsignedshortp,\n"
-    "                                  global const int * globalconstintp,\n"
-    "                                  global const uint*globalconstuintp,\n"
-    "                                  global const unsigned int *globalconstunsignedintp,\n"
-    "                                  global const float *globalconstfloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_scalar_restrict_p(global const void* restrict globalconstvoidrestrictp,\n"
-    "                                           global const char * restrict globalconstcharrestrictp,\n"
-    "                                           global const uchar*restrict globalconstucharrestrictp,\n"
-    "                                           global const unsigned char *restrict globalconstunsignedcharrestrictp,\n"
-    "                                           global const short* restrict globalconstshortrestrictp,\n"
-    "                                           global const ushort * restrict globalconstushortrestrictp,\n"
-    "                                           global const unsigned short*restrict globalconstunsignedshortrestrictp,\n"
-    "                                           global const int *restrict globalconstintrestrictp,\n"
-    "                                           global const uint* restrict globalconstuintrestrictp,\n"
-    "                                           global const unsigned int * restrict globalconstunsignedintrestrictp,\n"
-    "                                           global const float * restrict globalconstfloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_scalar_p(global volatile void*globalvolatilevoidp,\n"
-    "                                     global volatile char *globalvolatilecharp,\n"
-    "                                     global volatile uchar* globalvolatileucharp,\n"
-    "                                     global volatile unsigned char * globalvolatileunsignedcharp,\n"
-    "                                     global volatile short*globalvolatileshortp,\n"
-    "                                     global volatile ushort *globalvolatileushortp,\n"
-    "                                     global volatile unsigned short* globalvolatileunsignedshortp,\n"
-    "                                     global volatile int * globalvolatileintp,\n"
-    "                                     global volatile uint*globalvolatileuintp,\n"
-    "                                     global volatile unsigned int *globalvolatileunsignedintp,\n"
-    "                                     global volatile float *globalvolatilefloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_scalar_restrict_p(global volatile void* restrict globalvolatilevoidrestrictp,\n"
-    "                                              global volatile char * restrict globalvolatilecharrestrictp,\n"
-    "                                              global volatile uchar*restrict globalvolatileucharrestrictp,\n"
-    "                                              global volatile unsigned char *restrict globalvolatileunsignedcharrestrictp,\n"
-    "                                              global volatile short* restrict globalvolatileshortrestrictp,\n"
-    "                                              global volatile ushort * restrict globalvolatileushortrestrictp,\n"
-    "                                              global volatile unsigned short*restrict globalvolatileunsignedshortrestrictp,\n"
-    "                                              global volatile int *restrict globalvolatileintrestrictp,\n"
-    "                                              global volatile uint* restrict globalvolatileuintrestrictp,\n"
-    "                                              global volatile unsigned int * restrict globalvolatileunsignedintrestrictp,\n"
-    "                                              global volatile float * restrict globalvolatilefloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_scalar_p(global const volatile void*globalconstvolatilevoidp,\n"
-    "                                           global const volatile char *globalconstvolatilecharp,\n"
-    "                                           global const volatile uchar* globalconstvolatileucharp,\n"
-    "                                           global const volatile unsigned char * globalconstvolatileunsignedcharp,\n"
-    "                                           global const volatile short*globalconstvolatileshortp,\n"
-    "                                           global const volatile ushort *globalconstvolatileushortp,\n"
-    "                                           global const volatile unsigned short* globalconstvolatileunsignedshortp,\n"
-    "                                           global const volatile int * globalconstvolatileintp,\n"
-    "                                           global const volatile uint*globalconstvolatileuintp,\n"
-    "                                           global const volatile unsigned int *globalconstvolatileunsignedintp,\n"
-    "                                           global const volatile float *globalconstvolatilefloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_scalar_restrict_p(global const volatile void* restrict globalconstvolatilevoidrestrictp,\n"
-    "                                                    global const volatile char * restrict globalconstvolatilecharrestrictp,\n"
-    "                                                    global const volatile uchar*restrict globalconstvolatileucharrestrictp,\n"
-    "                                                    global const volatile unsigned char *restrict globalconstvolatileunsignedcharrestrictp,\n"
-    "                                                    global const volatile short* restrict globalconstvolatileshortrestrictp,\n"
-    "                                                    global const volatile ushort * restrict globalconstvolatileushortrestrictp,\n"
-    "                                                    global const volatile unsigned short*restrict globalconstvolatileunsignedshortrestrictp,\n"
-    "                                                    global const volatile int *restrict globalconstvolatileintrestrictp,\n"
-    "                                                    global const volatile uint* restrict globalconstvolatileuintrestrictp,\n"
-    "                                                    global const volatile unsigned int * restrict globalconstvolatileunsignedintrestrictp,\n"
-    "                                                    global const volatile float * restrict globalconstvolatilefloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_scalar_p(local void*localvoidp,\n"
-    "                           local char *localcharp,\n"
-    "                           local uchar* localucharp,\n"
-    "                           local unsigned char * localunsignedcharp,\n"
-    "                           local short*localshortp,\n"
-    "                           local ushort *localushortp,\n"
-    "                           local unsigned short* localunsignedshortp,\n"
-    "                           local int * localintp,\n"
-    "                           local uint*localuintp,\n"
-    "                           local unsigned int *localunsignedintp,\n"
-    "                           local float *localfloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_scalar_restrict_p(local void* restrict localvoidrestrictp,\n"
-    "                                    local char * restrict localcharrestrictp,\n"
-    "                                    local uchar*restrict localucharrestrictp,\n"
-    "                                    local unsigned char *restrict localunsignedcharrestrictp,\n"
-    "                                    local short* restrict localshortrestrictp,\n"
-    "                                    local ushort * restrict localushortrestrictp,\n"
-    "                                    local unsigned short*restrict localunsignedshortrestrictp,\n"
-    "                                    local int *restrict localintrestrictp,\n"
-    "                                    local uint* restrict localuintrestrictp,\n"
-    "                                    local unsigned int * restrict localunsignedintrestrictp,\n"
-    "                                    local float * restrict localfloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_scalar_p(local const void*localconstvoidp,\n"
-    "                                 local const char *localconstcharp,\n"
-    "                                 local const uchar* localconstucharp,\n"
-    "                                 local const unsigned char * localconstunsignedcharp,\n"
-    "                                 local const short*localconstshortp,\n"
-    "                                 local const ushort *localconstushortp,\n"
-    "                                 local const unsigned short* localconstunsignedshortp,\n"
-    "                                 local const int * localconstintp,\n"
-    "                                 local const uint*localconstuintp,\n"
-    "                                 local const unsigned int *localconstunsignedintp,\n"
-    "                                 local const float *localconstfloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_scalar_restrict_p(local const void* restrict localconstvoidrestrictp,\n"
-    "                                          local const char * restrict localconstcharrestrictp,\n"
-    "                                          local const uchar*restrict localconstucharrestrictp,\n"
-    "                                          local const unsigned char *restrict localconstunsignedcharrestrictp,\n"
-    "                                          local const short* restrict localconstshortrestrictp,\n"
-    "                                          local const ushort * restrict localconstushortrestrictp,\n"
-    "                                          local const unsigned short*restrict localconstunsignedshortrestrictp,\n"
-    "                                          local const int *restrict localconstintrestrictp,\n"
-    "                                          local const uint* restrict localconstuintrestrictp,\n"
-    "                                          local const unsigned int * restrict localconstunsignedintrestrictp,\n"
-    "                                          local const float * restrict localconstfloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_scalar_p(local volatile void*localvolatilevoidp,\n"
-    "                                    local volatile char *localvolatilecharp,\n"
-    "                                    local volatile uchar* localvolatileucharp,\n"
-    "                                    local volatile unsigned char * localvolatileunsignedcharp,\n"
-    "                                    local volatile short*localvolatileshortp,\n"
-    "                                    local volatile ushort *localvolatileushortp,\n"
-    "                                    local volatile unsigned short* localvolatileunsignedshortp,\n"
-    "                                    local volatile int * localvolatileintp,\n"
-    "                                    local volatile uint*localvolatileuintp,\n"
-    "                                    local volatile unsigned int *localvolatileunsignedintp,\n"
-    "                                    local volatile float *localvolatilefloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_scalar_restrict_p(local volatile void* restrict localvolatilevoidrestrictp,\n"
-    "                                             local volatile char * restrict localvolatilecharrestrictp,\n"
-    "                                             local volatile uchar*restrict localvolatileucharrestrictp,\n"
-    "                                             local volatile unsigned char *restrict localvolatileunsignedcharrestrictp,\n"
-    "                                             local volatile short* restrict localvolatileshortrestrictp,\n"
-    "                                             local volatile ushort * restrict localvolatileushortrestrictp,\n"
-    "                                             local volatile unsigned short*restrict localvolatileunsignedshortrestrictp,\n"
-    "                                             local volatile int *restrict localvolatileintrestrictp,\n"
-    "                                             local volatile uint* restrict localvolatileuintrestrictp,\n"
-    "                                             local volatile unsigned int * restrict localvolatileunsignedintrestrictp,\n"
-    "                                             local volatile float * restrict localvolatilefloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_scalar_p(local const volatile void*localconstvolatilevoidp,\n"
-    "                                          local const volatile char *localconstvolatilecharp,\n"
-    "                                          local const volatile uchar* localconstvolatileucharp,\n"
-    "                                          local const volatile unsigned char * localconstvolatileunsignedcharp,\n"
-    "                                          local const volatile short*localconstvolatileshortp,\n"
-    "                                          local const volatile ushort *localconstvolatileushortp,\n"
-    "                                          local const volatile unsigned short* localconstvolatileunsignedshortp,\n"
-    "                                          local const volatile int * localconstvolatileintp,\n"
-    "                                          local const volatile uint*localconstvolatileuintp,\n"
-    "                                          local const volatile unsigned int *localconstvolatileunsignedintp,\n"
-    "                                          local const volatile float *localconstvolatilefloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_scalar_restrict_p(local const volatile void* restrict localconstvolatilevoidrestrictp,\n"
-    "                                                   local const volatile char * restrict localconstvolatilecharrestrictp,\n"
-    "                                                   local const volatile uchar*restrict localconstvolatileucharrestrictp,\n"
-    "                                                   local const volatile unsigned char *restrict localconstvolatileunsignedcharrestrictp,\n"
-    "                                                   local const volatile short* restrict localconstvolatileshortrestrictp,\n"
-    "                                                   local const volatile ushort * restrict localconstvolatileushortrestrictp,\n"
-    "                                                   local const volatile unsigned short*restrict localconstvolatileunsignedshortrestrictp,\n"
-    "                                                   local const volatile int *restrict localconstvolatileintrestrictp,\n"
-    "                                                   local const volatile uint* restrict localconstvolatileuintrestrictp,\n"
-    "                                                   local const volatile unsigned int * restrict localconstvolatileunsignedintrestrictp,\n"
-    "                                                   local const volatile float * restrict localconstvolatilefloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void scalar_d(char chard,\n"
-    "                     uchar uchard,\n"
-    "                     unsigned char unsignedchard,\n"
-    "                     short shortd,\n"
-    "                     ushort ushortd,\n"
-    "                     unsigned short unsignedshortd,\n"
-    "                     int intd,\n"
-    "                     uint uintd,\n"
-    "                     unsigned int unsignedintd,\n"
-    "                     float floatd)\n"
-    "{}\n",
-    "\n"
-    "kernel void const_scalar_d(const char constchard,\n"
-    "                           const uchar constuchard,\n"
-    "                           const unsigned char constunsignedchard,\n"
-    "                           const short constshortd,\n"
-    "                           const ushort constushortd,\n"
-    "                           const unsigned short constunsignedshortd,\n"
-    "                           const int constintd,\n"
-    "                           const uint constuintd,\n"
-    "                           const unsigned int constunsignedintd,\n"
-    "                           const float constfloatd)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_scalar_d(private char privatechard,\n"
-    "                             private uchar privateuchard,\n"
-    "                             private unsigned char privateunsignedchard,\n"
-    "                             private short privateshortd,\n"
-    "                             private ushort privateushortd,\n"
-    "                             private unsigned short privateunsignedshortd,\n"
-    "                             private int privateintd,\n"
-    "                             private uint privateuintd,\n"
-    "                             private unsigned int privateunsignedintd,\n"
-    "                             private float privatefloatd)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_const_scalar_d(private const char privateconstchard,\n"
-    "                                   private const uchar privateconstuchard,\n"
-    "                                   private const unsigned char privateconstunsignedchard,\n"
-    "                                   private const short privateconstshortd,\n"
-    "                                   private const ushort privateconstushortd,\n"
-    "                                   private const unsigned short privateconstunsignedshortd,\n"
-    "                                   private const int privateconstintd,\n"
-    "                                   private const uint privateconstuintd,\n"
-    "                                   private const unsigned int privateconstunsignedintd,\n"
-    "                                   private const float privateconstfloatd)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector2_p0(constant char2*constantchar2p,\n"
-    "                               constant uchar2 *constantuchar2p,\n"
-    "                               constant short2* constantshort2p,\n"
-    "                               constant ushort2 * constantushort2p)\n"
-  "{}\n",
-    "\n"
-    "kernel void constant_vector2_p1(constant int2*constantint2p,\n"
-    "                               constant uint2 *constantuint2p)\n"
-  "{}\n",
-    "\n"
-    "kernel void constant_vector2_p2(constant float2*constantfloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector2_restrict_p0(constant char2 *restrict constantchar2restrictp,\n"
-    "                                        constant uchar2* restrict constantuchar2restrictp,\n"
-    "                                        constant short2 * restrict constantshort2restrictp,\n"
-    "                                        constant ushort2*restrict constantushort2restrictp)\n"
-  "{}\n",
-    "\n"
-    "kernel void constant_vector2_restrict_p1(constant int2 *restrict constantint2restrictp,\n"
-    "                                        constant uint2* restrict constantuint2restrictp)\n"
-  "{}\n",
-    "\n"
-    "kernel void constant_vector2_restrict_p2(constant float2 *restrict constantfloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector2_p(global char2*globalchar2p,\n"
-    "                             global uchar2 *globaluchar2p,\n"
-    "                             global short2* globalshort2p,\n"
-    "                             global ushort2 * globalushort2p,\n"
-    "                             global int2*globalint2p,\n"
-    "                             global uint2 *globaluint2p,\n"
-    "                             global float2*globalfloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector2_restrict_p(global char2 *restrict globalchar2restrictp,\n"
-    "                                      global uchar2* restrict globaluchar2restrictp,\n"
-    "                                      global short2 * restrict globalshort2restrictp,\n"
-    "                                      global ushort2*restrict globalushort2restrictp,\n"
-    "                                      global int2 *restrict globalint2restrictp,\n"
-    "                                      global uint2* restrict globaluint2restrictp,\n"
-    "                                      global float2 *restrict globalfloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector2_p(global const char2* globalconstchar2p,\n"
-    "                                   global const uchar2 * globalconstuchar2p,\n"
-    "                                   global const short2*globalconstshort2p,\n"
-    "                                   global const ushort2 *globalconstushort2p,\n"
-    "                                   global const int2* globalconstint2p,\n"
-    "                                   global const uint2 * globalconstuint2p,\n"
-    "                                   global const float2* globalconstfloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector2_restrict_p(global const char2 * restrict globalconstchar2restrictp,\n"
-    "                                            global const uchar2*restrict globalconstuchar2restrictp,\n"
-    "                                            global const short2 *restrict globalconstshort2restrictp,\n"
-    "                                            global const ushort2* restrict globalconstushort2restrictp,\n"
-    "                                            global const int2 * restrict globalconstint2restrictp,\n"
-    "                                            global const uint2*restrict globalconstuint2restrictp,\n"
-    "                                            global const float2 * restrict globalconstfloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector2_p(global volatile char2*globalvolatilechar2p,\n"
-    "                                      global volatile uchar2 *globalvolatileuchar2p,\n"
-    "                                      global volatile short2* globalvolatileshort2p,\n"
-    "                                      global volatile ushort2 * globalvolatileushort2p,\n"
-    "                                      global volatile int2*globalvolatileint2p,\n"
-    "                                      global volatile uint2 *globalvolatileuint2p,\n"
-    "                                      global volatile float2*globalvolatilefloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector2_restrict_p(global volatile char2 *restrict globalvolatilechar2restrictp,\n"
-    "                                               global volatile uchar2* restrict globalvolatileuchar2restrictp,\n"
-    "                                               global volatile short2 * restrict globalvolatileshort2restrictp,\n"
-    "                                               global volatile ushort2*restrict globalvolatileushort2restrictp,\n"
-    "                                               global volatile int2 *restrict globalvolatileint2restrictp,\n"
-    "                                               global volatile uint2* restrict globalvolatileuint2restrictp,\n"
-    "                                               global volatile float2 *restrict globalvolatilefloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector2_p(global const volatile char2* globalconstvolatilechar2p,\n"
-    "                                            global const volatile uchar2 * globalconstvolatileuchar2p,\n"
-    "                                            global const volatile short2*globalconstvolatileshort2p,\n"
-    "                                            global const volatile ushort2 *globalconstvolatileushort2p,\n"
-    "                                            global const volatile int2* globalconstvolatileint2p,\n"
-    "                                            global const volatile uint2 * globalconstvolatileuint2p,\n"
-    "                                            global const volatile float2* globalconstvolatilefloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector2_restrict_p(global const volatile char2 * restrict globalconstvolatilechar2restrictp,\n"
-    "                                                     global const volatile uchar2*restrict globalconstvolatileuchar2restrictp,\n"
-    "                                                     global const volatile short2 *restrict globalconstvolatileshort2restrictp,\n"
-    "                                                     global const volatile ushort2* restrict globalconstvolatileushort2restrictp,\n"
-    "                                                     global const volatile int2 * restrict globalconstvolatileint2restrictp,\n"
-    "                                                     global const volatile uint2*restrict globalconstvolatileuint2restrictp,\n"
-    "                                                     global const volatile float2 * restrict globalconstvolatilefloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector2_p(local char2*localchar2p,\n"
-    "                            local uchar2 *localuchar2p,\n"
-    "                            local short2* localshort2p,\n"
-    "                            local ushort2 * localushort2p,\n"
-    "                            local int2*localint2p,\n"
-    "                            local uint2 *localuint2p,\n"
-    "                            local float2*localfloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector2_restrict_p(local char2 *restrict localchar2restrictp,\n"
-    "                                     local uchar2* restrict localuchar2restrictp,\n"
-    "                                     local short2 * restrict localshort2restrictp,\n"
-    "                                     local ushort2*restrict localushort2restrictp,\n"
-    "                                     local int2 *restrict localint2restrictp,\n"
-    "                                     local uint2* restrict localuint2restrictp,\n"
-    "                                     local float2 *restrict localfloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector2_p(local const char2* localconstchar2p,\n"
-    "                                  local const uchar2 * localconstuchar2p,\n"
-    "                                  local const short2*localconstshort2p,\n"
-    "                                  local const ushort2 *localconstushort2p,\n"
-    "                                  local const int2* localconstint2p,\n"
-    "                                  local const uint2 * localconstuint2p,\n"
-    "                                  local const float2* localconstfloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector2_restrict_p(local const char2 * restrict localconstchar2restrictp,\n"
-    "                                           local const uchar2*restrict localconstuchar2restrictp,\n"
-    "                                           local const short2 *restrict localconstshort2restrictp,\n"
-    "                                           local const ushort2* restrict localconstushort2restrictp,\n"
-    "                                           local const int2 * restrict localconstint2restrictp,\n"
-    "                                           local const uint2*restrict localconstuint2restrictp,\n"
-    "                                           local const float2 * restrict localconstfloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector2_p(local volatile char2*localvolatilechar2p,\n"
-    "                                     local volatile uchar2 *localvolatileuchar2p,\n"
-    "                                     local volatile short2* localvolatileshort2p,\n"
-    "                                     local volatile ushort2 * localvolatileushort2p,\n"
-    "                                     local volatile int2*localvolatileint2p,\n"
-    "                                     local volatile uint2 *localvolatileuint2p,\n"
-    "                                     local volatile float2*localvolatilefloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector2_restrict_p(local volatile char2 *restrict localvolatilechar2restrictp,\n"
-    "                                              local volatile uchar2* restrict localvolatileuchar2restrictp,\n"
-    "                                              local volatile short2 * restrict localvolatileshort2restrictp,\n"
-    "                                              local volatile ushort2*restrict localvolatileushort2restrictp,\n"
-    "                                              local volatile int2 *restrict localvolatileint2restrictp,\n"
-    "                                              local volatile uint2* restrict localvolatileuint2restrictp,\n"
-    "                                              local volatile float2 *restrict localvolatilefloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector2_p(local const volatile char2* localconstvolatilechar2p,\n"
-    "                                           local const volatile uchar2 * localconstvolatileuchar2p,\n"
-    "                                           local const volatile short2*localconstvolatileshort2p,\n"
-    "                                           local const volatile ushort2 *localconstvolatileushort2p,\n"
-    "                                           local const volatile int2* localconstvolatileint2p,\n"
-    "                                           local const volatile uint2 * localconstvolatileuint2p,\n"
-    "                                           local const volatile float2* localconstvolatilefloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector2_restrict_p(local const volatile char2 * restrict localconstvolatilechar2restrictp,\n"
-    "                                                    local const volatile uchar2*restrict localconstvolatileuchar2restrictp,\n"
-    "                                                    local const volatile short2 *restrict localconstvolatileshort2restrictp,\n"
-    "                                                    local const volatile ushort2* restrict localconstvolatileushort2restrictp,\n"
-    "                                                    local const volatile int2 * restrict localconstvolatileint2restrictp,\n"
-    "                                                    local const volatile uint2*restrict localconstvolatileuint2restrictp,\n"
-    "                                                    local const volatile float2 * restrict localconstvolatilefloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void vector2_d(char2 char2d,\n"
-    "                      uchar2 uchar2d,\n"
-    "                      short2 short2d,\n"
-    "                      ushort2 ushort2d,\n"
-    "                      int2 int2d,\n"
-    "                      uint2 uint2d,\n"
-    "                      float2 float2d)\n"
-    "{}\n",
-    "\n"
-    "kernel void const_vector2_d(const char2 constchar2d,\n"
-    "                            const uchar2 constuchar2d,\n"
-    "                            const short2 constshort2d,\n"
-    "                            const ushort2 constushort2d,\n"
-    "                            const int2 constint2d,\n"
-    "                            const uint2 constuint2d,\n"
-    "                            const float2 constfloat2d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_vector2_d(private char2 privatechar2d,\n"
-    "                              private uchar2 privateuchar2d,\n"
-    "                              private short2 privateshort2d,\n"
-    "                              private ushort2 privateushort2d,\n"
-    "                              private int2 privateint2d,\n"
-    "                              private uint2 privateuint2d,\n"
-    "                              private float2 privatefloat2d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_const_vector2_d(private const char2 privateconstchar2d,\n"
-    "                                    private const uchar2 privateconstuchar2d,\n"
-    "                                    private const short2 privateconstshort2d,\n"
-    "                                    private const ushort2 privateconstushort2d,\n"
-    "                                    private const int2 privateconstint2d,\n"
-    "                                    private const uint2 privateconstuint2d,\n"
-    "                                    private const float2 privateconstfloat2d)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector3_p0(constant char3*constantchar3p,\n"
-    "                               constant uchar3 *constantuchar3p,\n"
-    "                               constant short3* constantshort3p,\n"
-    "                               constant ushort3 * constantushort3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector3_p1(constant int3*constantint3p,\n"
-    "                               constant uint3 *constantuint3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector3_p2(constant float3*constantfloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector3_restrict_p0(constant char3 *restrict constantchar3restrictp,\n"
-    "                                        constant uchar3* restrict constantuchar3restrictp,\n"
-    "                                        constant short3 * restrict constantshort3restrictp,\n"
-    "                                        constant ushort3*restrict constantushort3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector3_restrict_p1(constant int3 *restrict constantint3restrictp,\n"
-    "                                        constant uint3* restrict constantuint3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector3_restrict_p2(constant float3 *restrict constantfloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector3_p(global char3*globalchar3p,\n"
-    "                             global uchar3 *globaluchar3p,\n"
-    "                             global short3* globalshort3p,\n"
-    "                             global ushort3 * globalushort3p,\n"
-    "                             global int3*globalint3p,\n"
-    "                             global uint3 *globaluint3p,\n"
-    "                             global float3*globalfloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector3_restrict_p(global char3 *restrict globalchar3restrictp,\n"
-    "                                      global uchar3* restrict globaluchar3restrictp,\n"
-    "                                      global short3 * restrict globalshort3restrictp,\n"
-    "                                      global ushort3*restrict globalushort3restrictp,\n"
-    "                                      global int3 *restrict globalint3restrictp,\n"
-    "                                      global uint3* restrict globaluint3restrictp,\n"
-    "                                      global float3 *restrict globalfloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector3_p(global const char3* globalconstchar3p,\n"
-    "                                   global const uchar3 * globalconstuchar3p,\n"
-    "                                   global const short3*globalconstshort3p,\n"
-    "                                   global const ushort3 *globalconstushort3p,\n"
-    "                                   global const int3* globalconstint3p,\n"
-    "                                   global const uint3 * globalconstuint3p,\n"
-    "                                   global const float3* globalconstfloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector3_restrict_p(global const char3 * restrict globalconstchar3restrictp,\n"
-    "                                            global const uchar3*restrict globalconstuchar3restrictp,\n"
-    "                                            global const short3 *restrict globalconstshort3restrictp,\n"
-    "                                            global const ushort3* restrict globalconstushort3restrictp,\n"
-    "                                            global const int3 * restrict globalconstint3restrictp,\n"
-    "                                            global const uint3*restrict globalconstuint3restrictp,\n"
-    "                                            global const float3 * restrict globalconstfloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector3_p(global volatile char3*globalvolatilechar3p,\n"
-    "                                      global volatile uchar3 *globalvolatileuchar3p,\n"
-    "                                      global volatile short3* globalvolatileshort3p,\n"
-    "                                      global volatile ushort3 * globalvolatileushort3p,\n"
-    "                                      global volatile int3*globalvolatileint3p,\n"
-    "                                      global volatile uint3 *globalvolatileuint3p,\n"
-    "                                      global volatile float3*globalvolatilefloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector3_restrict_p(global volatile char3 *restrict globalvolatilechar3restrictp,\n"
-    "                                               global volatile uchar3* restrict globalvolatileuchar3restrictp,\n"
-    "                                               global volatile short3 * restrict globalvolatileshort3restrictp,\n"
-    "                                               global volatile ushort3*restrict globalvolatileushort3restrictp,\n"
-    "                                               global volatile int3 *restrict globalvolatileint3restrictp,\n"
-    "                                               global volatile uint3* restrict globalvolatileuint3restrictp,\n"
-    "                                               global volatile float3 *restrict globalvolatilefloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector3_p(global const volatile char3* globalconstvolatilechar3p,\n"
-    "                                            global const volatile uchar3 * globalconstvolatileuchar3p,\n"
-    "                                            global const volatile short3*globalconstvolatileshort3p,\n"
-    "                                            global const volatile ushort3 *globalconstvolatileushort3p,\n"
-    "                                            global const volatile int3* globalconstvolatileint3p,\n"
-    "                                            global const volatile uint3 * globalconstvolatileuint3p,\n"
-    "                                            global const volatile float3* globalconstvolatilefloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector3_restrict_p(global const volatile char3 * restrict globalconstvolatilechar3restrictp,\n"
-    "                                                     global const volatile uchar3*restrict globalconstvolatileuchar3restrictp,\n"
-    "                                                     global const volatile short3 *restrict globalconstvolatileshort3restrictp,\n"
-    "                                                     global const volatile ushort3* restrict globalconstvolatileushort3restrictp,\n"
-    "                                                     global const volatile int3 * restrict globalconstvolatileint3restrictp,\n"
-    "                                                     global const volatile uint3*restrict globalconstvolatileuint3restrictp,\n"
-    "                                                     global const volatile float3 * restrict globalconstvolatilefloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector3_p(local char3*localchar3p,\n"
-    "                            local uchar3 *localuchar3p,\n"
-    "                            local short3* localshort3p,\n"
-    "                            local ushort3 * localushort3p,\n"
-    "                            local int3*localint3p,\n"
-    "                            local uint3 *localuint3p,\n"
-    "                            local float3*localfloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector3_restrict_p(local char3 *restrict localchar3restrictp,\n"
-    "                                     local uchar3* restrict localuchar3restrictp,\n"
-    "                                     local short3 * restrict localshort3restrictp,\n"
-    "                                     local ushort3*restrict localushort3restrictp,\n"
-    "                                     local int3 *restrict localint3restrictp,\n"
-    "                                     local uint3* restrict localuint3restrictp,\n"
-    "                                     local float3 *restrict localfloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector3_p(local const char3* localconstchar3p,\n"
-    "                                  local const uchar3 * localconstuchar3p,\n"
-    "                                  local const short3*localconstshort3p,\n"
-    "                                  local const ushort3 *localconstushort3p,\n"
-    "                                  local const int3* localconstint3p,\n"
-    "                                  local const uint3 * localconstuint3p,\n"
-    "                                  local const float3* localconstfloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector3_restrict_p(local const char3 * restrict localconstchar3restrictp,\n"
-    "                                           local const uchar3*restrict localconstuchar3restrictp,\n"
-    "                                           local const short3 *restrict localconstshort3restrictp,\n"
-    "                                           local const ushort3* restrict localconstushort3restrictp,\n"
-    "                                           local const int3 * restrict localconstint3restrictp,\n"
-    "                                           local const uint3*restrict localconstuint3restrictp,\n"
-    "                                           local const float3 * restrict localconstfloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector3_p(local volatile char3*localvolatilechar3p,\n"
-    "                                     local volatile uchar3 *localvolatileuchar3p,\n"
-    "                                     local volatile short3* localvolatileshort3p,\n"
-    "                                     local volatile ushort3 * localvolatileushort3p,\n"
-    "                                     local volatile int3*localvolatileint3p,\n"
-    "                                     local volatile uint3 *localvolatileuint3p,\n"
-    "                                     local volatile float3*localvolatilefloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector3_restrict_p(local volatile char3 *restrict localvolatilechar3restrictp,\n"
-    "                                              local volatile uchar3* restrict localvolatileuchar3restrictp,\n"
-    "                                              local volatile short3 * restrict localvolatileshort3restrictp,\n"
-    "                                              local volatile ushort3*restrict localvolatileushort3restrictp,\n"
-    "                                              local volatile int3 *restrict localvolatileint3restrictp,\n"
-    "                                              local volatile uint3* restrict localvolatileuint3restrictp,\n"
-    "                                              local volatile float3 *restrict localvolatilefloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector3_p(local const volatile char3* localconstvolatilechar3p,\n"
-    "                                           local const volatile uchar3 * localconstvolatileuchar3p,\n"
-    "                                           local const volatile short3*localconstvolatileshort3p,\n"
-    "                                           local const volatile ushort3 *localconstvolatileushort3p,\n"
-    "                                           local const volatile int3* localconstvolatileint3p,\n"
-    "                                           local const volatile uint3 * localconstvolatileuint3p,\n"
-    "                                           local const volatile float3* localconstvolatilefloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector3_restrict_p(local const volatile char3 * restrict localconstvolatilechar3restrictp,\n"
-    "                                                    local const volatile uchar3*restrict localconstvolatileuchar3restrictp,\n"
-    "                                                    local const volatile short3 *restrict localconstvolatileshort3restrictp,\n"
-    "                                                    local const volatile ushort3* restrict localconstvolatileushort3restrictp,\n"
-    "                                                    local const volatile int3 * restrict localconstvolatileint3restrictp,\n"
-    "                                                    local const volatile uint3*restrict localconstvolatileuint3restrictp,\n"
-    "                                                    local const volatile float3 * restrict localconstvolatilefloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void vector3_d(char3 char3d,\n"
-    "                      uchar3 uchar3d,\n"
-    "                      short3 short3d,\n"
-    "                      ushort3 ushort3d,\n"
-    "                      int3 int3d,\n"
-    "                      uint3 uint3d,\n"
-    "                      float3 float3d)\n"
-    "{}\n",
-    "\n"
-    "kernel void const_vector3_d(const char3 constchar3d,\n"
-    "                            const uchar3 constuchar3d,\n"
-    "                            const short3 constshort3d,\n"
-    "                            const ushort3 constushort3d,\n"
-    "                            const int3 constint3d,\n"
-    "                            const uint3 constuint3d,\n"
-    "                            const float3 constfloat3d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_vector3_d(private char3 privatechar3d,\n"
-    "                              private uchar3 privateuchar3d,\n"
-    "                              private short3 privateshort3d,\n"
-    "                              private ushort3 privateushort3d,\n"
-    "                              private int3 privateint3d,\n"
-    "                              private uint3 privateuint3d,\n"
-    "                              private float3 privatefloat3d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_const_vector3_d(private const char3 privateconstchar3d,\n"
-    "                                    private const uchar3 privateconstuchar3d,\n"
-    "                                    private const short3 privateconstshort3d,\n"
-    "                                    private const ushort3 privateconstushort3d,\n"
-    "                                    private const int3 privateconstint3d,\n"
-    "                                    private const uint3 privateconstuint3d,\n"
-    "                                    private const float3 privateconstfloat3d)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector4_p0(constant char4*constantchar4p,\n"
-    "                               constant uchar4 *constantuchar4p,\n"
-    "                               constant short4* constantshort4p,\n"
-    "                               constant ushort4 * constantushort4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector4_p1(constant int4*constantint4p,\n"
-    "                               constant uint4 *constantuint4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector4_p2(constant float4*constantfloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector4_restrict_p0(constant char4 *restrict constantchar4restrictp,\n"
-    "                                        constant uchar4* restrict constantuchar4restrictp,\n"
-    "                                        constant short4 * restrict constantshort4restrictp,\n"
-    "                                        constant ushort4*restrict constantushort4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector4_restrict_p1(constant int4 *restrict constantint4restrictp,\n"
-    "                                        constant uint4* restrict constantuint4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector4_restrict_p2(constant float4 *restrict constantfloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector4_p(global char4*globalchar4p,\n"
-    "                             global uchar4 *globaluchar4p,\n"
-    "                             global short4* globalshort4p,\n"
-    "                             global ushort4 * globalushort4p,\n"
-    "                             global int4*globalint4p,\n"
-    "                             global uint4 *globaluint4p,\n"
-    "                             global float4*globalfloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector4_restrict_p(global char4 *restrict globalchar4restrictp,\n"
-    "                                      global uchar4* restrict globaluchar4restrictp,\n"
-    "                                      global short4 * restrict globalshort4restrictp,\n"
-    "                                      global ushort4*restrict globalushort4restrictp,\n"
-    "                                      global int4 *restrict globalint4restrictp,\n"
-    "                                      global uint4* restrict globaluint4restrictp,\n"
-    "                                      global float4 *restrict globalfloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector4_p(global const char4* globalconstchar4p,\n"
-    "                                   global const uchar4 * globalconstuchar4p,\n"
-    "                                   global const short4*globalconstshort4p,\n"
-    "                                   global const ushort4 *globalconstushort4p,\n"
-    "                                   global const int4* globalconstint4p,\n"
-    "                                   global const uint4 * globalconstuint4p,\n"
-    "                                   global const float4* globalconstfloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector4_restrict_p(global const char4 * restrict globalconstchar4restrictp,\n"
-    "                                            global const uchar4*restrict globalconstuchar4restrictp,\n"
-    "                                            global const short4 *restrict globalconstshort4restrictp,\n"
-    "                                            global const ushort4* restrict globalconstushort4restrictp,\n"
-    "                                            global const int4 * restrict globalconstint4restrictp,\n"
-    "                                            global const uint4*restrict globalconstuint4restrictp,\n"
-    "                                            global const float4 * restrict globalconstfloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector4_p(global volatile char4*globalvolatilechar4p,\n"
-    "                                      global volatile uchar4 *globalvolatileuchar4p,\n"
-    "                                      global volatile short4* globalvolatileshort4p,\n"
-    "                                      global volatile ushort4 * globalvolatileushort4p,\n"
-    "                                      global volatile int4*globalvolatileint4p,\n"
-    "                                      global volatile uint4 *globalvolatileuint4p,\n"
-    "                                      global volatile float4*globalvolatilefloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector4_restrict_p(global volatile char4 *restrict globalvolatilechar4restrictp,\n"
-    "                                               global volatile uchar4* restrict globalvolatileuchar4restrictp,\n"
-    "                                               global volatile short4 * restrict globalvolatileshort4restrictp,\n"
-    "                                               global volatile ushort4*restrict globalvolatileushort4restrictp,\n"
-    "                                               global volatile int4 *restrict globalvolatileint4restrictp,\n"
-    "                                               global volatile uint4* restrict globalvolatileuint4restrictp,\n"
-    "                                               global volatile float4 *restrict globalvolatilefloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector4_p(global const volatile char4* globalconstvolatilechar4p,\n"
-    "                                            global const volatile uchar4 * globalconstvolatileuchar4p,\n"
-    "                                            global const volatile short4*globalconstvolatileshort4p,\n"
-    "                                            global const volatile ushort4 *globalconstvolatileushort4p,\n"
-    "                                            global const volatile int4* globalconstvolatileint4p,\n"
-    "                                            global const volatile uint4 * globalconstvolatileuint4p,\n"
-    "                                            global const volatile float4* globalconstvolatilefloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector4_restrict_p(global const volatile char4 * restrict globalconstvolatilechar4restrictp,\n"
-    "                                                     global const volatile uchar4*restrict globalconstvolatileuchar4restrictp,\n"
-    "                                                     global const volatile short4 *restrict globalconstvolatileshort4restrictp,\n"
-    "                                                     global const volatile ushort4* restrict globalconstvolatileushort4restrictp,\n"
-    "                                                     global const volatile int4 * restrict globalconstvolatileint4restrictp,\n"
-    "                                                     global const volatile uint4*restrict globalconstvolatileuint4restrictp,\n"
-    "                                                     global const volatile float4 * restrict globalconstvolatilefloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector4_p(local char4*localchar4p,\n"
-    "                            local uchar4 *localuchar4p,\n"
-    "                            local short4* localshort4p,\n"
-    "                            local ushort4 * localushort4p,\n"
-    "                            local int4*localint4p,\n"
-    "                            local uint4 *localuint4p,\n"
-    "                            local float4*localfloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector4_restrict_p(local char4 *restrict localchar4restrictp,\n"
-    "                                     local uchar4* restrict localuchar4restrictp,\n"
-    "                                     local short4 * restrict localshort4restrictp,\n"
-    "                                     local ushort4*restrict localushort4restrictp,\n"
-    "                                     local int4 *restrict localint4restrictp,\n"
-    "                                     local uint4* restrict localuint4restrictp,\n"
-    "                                     local float4 *restrict localfloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector4_p(local const char4* localconstchar4p,\n"
-    "                                  local const uchar4 * localconstuchar4p,\n"
-    "                                  local const short4*localconstshort4p,\n"
-    "                                  local const ushort4 *localconstushort4p,\n"
-    "                                  local const int4* localconstint4p,\n"
-    "                                  local const uint4 * localconstuint4p,\n"
-    "                                  local const float4* localconstfloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector4_restrict_p(local const char4 * restrict localconstchar4restrictp,\n"
-    "                                           local const uchar4*restrict localconstuchar4restrictp,\n"
-    "                                           local const short4 *restrict localconstshort4restrictp,\n"
-    "                                           local const ushort4* restrict localconstushort4restrictp,\n"
-    "                                           local const int4 * restrict localconstint4restrictp,\n"
-    "                                           local const uint4*restrict localconstuint4restrictp,\n"
-    "                                           local const float4 * restrict localconstfloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector4_p(local volatile char4*localvolatilechar4p,\n"
-    "                                     local volatile uchar4 *localvolatileuchar4p,\n"
-    "                                     local volatile short4* localvolatileshort4p,\n"
-    "                                     local volatile ushort4 * localvolatileushort4p,\n"
-    "                                     local volatile int4*localvolatileint4p,\n"
-    "                                     local volatile uint4 *localvolatileuint4p,\n"
-    "                                     local volatile float4*localvolatilefloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector4_restrict_p(local volatile char4 *restrict localvolatilechar4restrictp,\n"
-    "                                              local volatile uchar4* restrict localvolatileuchar4restrictp,\n"
-    "                                              local volatile short4 * restrict localvolatileshort4restrictp,\n"
-    "                                              local volatile ushort4*restrict localvolatileushort4restrictp,\n"
-    "                                              local volatile int4 *restrict localvolatileint4restrictp,\n"
-    "                                              local volatile uint4* restrict localvolatileuint4restrictp,\n"
-    "                                              local volatile float4 *restrict localvolatilefloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector4_p(local const volatile char4* localconstvolatilechar4p,\n"
-    "                                           local const volatile uchar4 * localconstvolatileuchar4p,\n"
-    "                                           local const volatile short4*localconstvolatileshort4p,\n"
-    "                                           local const volatile ushort4 *localconstvolatileushort4p,\n"
-    "                                           local const volatile int4* localconstvolatileint4p,\n"
-    "                                           local const volatile uint4 * localconstvolatileuint4p,\n"
-    "                                           local const volatile float4* localconstvolatilefloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector4_restrict_p(local const volatile char4 * restrict localconstvolatilechar4restrictp,\n"
-    "                                                    local const volatile uchar4*restrict localconstvolatileuchar4restrictp,\n"
-    "                                                    local const volatile short4 *restrict localconstvolatileshort4restrictp,\n"
-    "                                                    local const volatile ushort4* restrict localconstvolatileushort4restrictp,\n"
-    "                                                    local const volatile int4 * restrict localconstvolatileint4restrictp,\n"
-    "                                                    local const volatile uint4*restrict localconstvolatileuint4restrictp,\n"
-    "                                                    local const volatile float4 * restrict localconstvolatilefloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void vector4_d(char4 char4d,\n"
-    "                      uchar4 uchar4d,\n"
-    "                      short4 short4d,\n"
-    "                      ushort4 ushort4d,\n"
-    "                      int4 int4d,\n"
-    "                      uint4 uint4d,\n"
-    "                      float4 float4d)\n"
-    "{}\n",
-    "\n"
-    "kernel void const_vector4_d(const char4 constchar4d,\n"
-    "                            const uchar4 constuchar4d,\n"
-    "                            const short4 constshort4d,\n"
-    "                            const ushort4 constushort4d,\n"
-    "                            const int4 constint4d,\n"
-    "                            const uint4 constuint4d,\n"
-    "                            const float4 constfloat4d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_vector4_d(private char4 privatechar4d,\n"
-    "                              private uchar4 privateuchar4d,\n"
-    "                              private short4 privateshort4d,\n"
-    "                              private ushort4 privateushort4d,\n"
-    "                              private int4 privateint4d,\n"
-    "                              private uint4 privateuint4d,\n"
-    "                              private float4 privatefloat4d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_const_vector4_d(private const char4 privateconstchar4d,\n"
-    "                                    private const uchar4 privateconstuchar4d,\n"
-    "                                    private const short4 privateconstshort4d,\n"
-    "                                    private const ushort4 privateconstushort4d,\n"
-    "                                    private const int4 privateconstint4d,\n"
-    "                                    private const uint4 privateconstuint4d,\n"
-    "                                    private const float4 privateconstfloat4d)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector8_p0(constant char8*constantchar8p,\n"
-    "                               constant uchar8 *constantuchar8p,\n"
-    "                               constant short8* constantshort8p,\n"
-    "                               constant ushort8 * constantushort8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector8_p1(constant int8*constantint8p,\n"
-    "                               constant uint8 *constantuint8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector8_p2(constant float8*constantfloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector8_restrict_p0(constant char8 *restrict constantchar8restrictp,\n"
-    "                                        constant uchar8* restrict constantuchar8restrictp,\n"
-    "                                        constant short8 * restrict constantshort8restrictp,\n"
-    "                                        constant ushort8*restrict constantushort8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector8_restrict_p1(constant int8 *restrict constantint8restrictp,\n"
-    "                                        constant uint8* restrict constantuint8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector8_restrict_p2(constant float8 *restrict constantfloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector8_p(global char8*globalchar8p,\n"
-    "                             global uchar8 *globaluchar8p,\n"
-    "                             global short8* globalshort8p,\n"
-    "                             global ushort8 * globalushort8p,\n"
-    "                             global int8*globalint8p,\n"
-    "                             global uint8 *globaluint8p,\n"
-    "                             global float8*globalfloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector8_restrict_p(global char8 *restrict globalchar8restrictp,\n"
-    "                                      global uchar8* restrict globaluchar8restrictp,\n"
-    "                                      global short8 * restrict globalshort8restrictp,\n"
-    "                                      global ushort8*restrict globalushort8restrictp,\n"
-    "                                      global int8 *restrict globalint8restrictp,\n"
-    "                                      global uint8* restrict globaluint8restrictp,\n"
-    "                                      global float8 *restrict globalfloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector8_p(global const char8* globalconstchar8p,\n"
-    "                                   global const uchar8 * globalconstuchar8p,\n"
-    "                                   global const short8*globalconstshort8p,\n"
-    "                                   global const ushort8 *globalconstushort8p,\n"
-    "                                   global const int8* globalconstint8p,\n"
-    "                                   global const uint8 * globalconstuint8p,\n"
-    "                                   global const float8* globalconstfloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector8_restrict_p(global const char8 * restrict globalconstchar8restrictp,\n"
-    "                                            global const uchar8*restrict globalconstuchar8restrictp,\n"
-    "                                            global const short8 *restrict globalconstshort8restrictp,\n"
-    "                                            global const ushort8* restrict globalconstushort8restrictp,\n"
-    "                                            global const int8 * restrict globalconstint8restrictp,\n"
-    "                                            global const uint8*restrict globalconstuint8restrictp,\n"
-    "                                            global const float8 * restrict globalconstfloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector8_p(global volatile char8*globalvolatilechar8p,\n"
-    "                                      global volatile uchar8 *globalvolatileuchar8p,\n"
-    "                                      global volatile short8* globalvolatileshort8p,\n"
-    "                                      global volatile ushort8 * globalvolatileushort8p,\n"
-    "                                      global volatile int8*globalvolatileint8p,\n"
-    "                                      global volatile uint8 *globalvolatileuint8p,\n"
-    "                                      global volatile float8*globalvolatilefloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector8_restrict_p(global volatile char8 *restrict globalvolatilechar8restrictp,\n"
-    "                                               global volatile uchar8* restrict globalvolatileuchar8restrictp,\n"
-    "                                               global volatile short8 * restrict globalvolatileshort8restrictp,\n"
-    "                                               global volatile ushort8*restrict globalvolatileushort8restrictp,\n"
-    "                                               global volatile int8 *restrict globalvolatileint8restrictp,\n"
-    "                                               global volatile uint8* restrict globalvolatileuint8restrictp,\n"
-    "                                               global volatile float8 *restrict globalvolatilefloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector8_p(global const volatile char8* globalconstvolatilechar8p,\n"
-    "                                            global const volatile uchar8 * globalconstvolatileuchar8p,\n"
-    "                                            global const volatile short8*globalconstvolatileshort8p,\n"
-    "                                            global const volatile ushort8 *globalconstvolatileushort8p,\n"
-    "                                            global const volatile int8* globalconstvolatileint8p,\n"
-    "                                            global const volatile uint8 * globalconstvolatileuint8p,\n"
-    "                                            global const volatile float8* globalconstvolatilefloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector8_restrict_p(global const volatile char8 * restrict globalconstvolatilechar8restrictp,\n"
-    "                                                     global const volatile uchar8*restrict globalconstvolatileuchar8restrictp,\n"
-    "                                                     global const volatile short8 *restrict globalconstvolatileshort8restrictp,\n"
-    "                                                     global const volatile ushort8* restrict globalconstvolatileushort8restrictp,\n"
-    "                                                     global const volatile int8 * restrict globalconstvolatileint8restrictp,\n"
-    "                                                     global const volatile uint8*restrict globalconstvolatileuint8restrictp,\n"
-    "                                                     global const volatile float8 * restrict globalconstvolatilefloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector8_p(local char8*localchar8p,\n"
-    "                            local uchar8 *localuchar8p,\n"
-    "                            local short8* localshort8p,\n"
-    "                            local ushort8 * localushort8p,\n"
-    "                            local int8*localint8p,\n"
-    "                            local uint8 *localuint8p,\n"
-    "                            local float8*localfloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector8_restrict_p(local char8 *restrict localchar8restrictp,\n"
-    "                                     local uchar8* restrict localuchar8restrictp,\n"
-    "                                     local short8 * restrict localshort8restrictp,\n"
-    "                                     local ushort8*restrict localushort8restrictp,\n"
-    "                                     local int8 *restrict localint8restrictp,\n"
-    "                                     local uint8* restrict localuint8restrictp,\n"
-    "                                     local float8 *restrict localfloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector8_p(local const char8* localconstchar8p,\n"
-    "                                  local const uchar8 * localconstuchar8p,\n"
-    "                                  local const short8*localconstshort8p,\n"
-    "                                  local const ushort8 *localconstushort8p,\n"
-    "                                  local const int8* localconstint8p,\n"
-    "                                  local const uint8 * localconstuint8p,\n"
-    "                                  local const float8* localconstfloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector8_restrict_p(local const char8 * restrict localconstchar8restrictp,\n"
-    "                                           local const uchar8*restrict localconstuchar8restrictp,\n"
-    "                                           local const short8 *restrict localconstshort8restrictp,\n"
-    "                                           local const ushort8* restrict localconstushort8restrictp,\n"
-    "                                           local const int8 * restrict localconstint8restrictp,\n"
-    "                                           local const uint8*restrict localconstuint8restrictp,\n"
-    "                                           local const float8 * restrict localconstfloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector8_p(local volatile char8*localvolatilechar8p,\n"
-    "                                     local volatile uchar8 *localvolatileuchar8p,\n"
-    "                                     local volatile short8* localvolatileshort8p,\n"
-    "                                     local volatile ushort8 * localvolatileushort8p,\n"
-    "                                     local volatile int8*localvolatileint8p,\n"
-    "                                     local volatile uint8 *localvolatileuint8p,\n"
-    "                                     local volatile float8*localvolatilefloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector8_restrict_p(local volatile char8 *restrict localvolatilechar8restrictp,\n"
-    "                                              local volatile uchar8* restrict localvolatileuchar8restrictp,\n"
-    "                                              local volatile short8 * restrict localvolatileshort8restrictp,\n"
-    "                                              local volatile ushort8*restrict localvolatileushort8restrictp,\n"
-    "                                              local volatile int8 *restrict localvolatileint8restrictp,\n"
-    "                                              local volatile uint8* restrict localvolatileuint8restrictp,\n"
-    "                                              local volatile float8 *restrict localvolatilefloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector8_p(local const volatile char8* localconstvolatilechar8p,\n"
-    "                                           local const volatile uchar8 * localconstvolatileuchar8p,\n"
-    "                                           local const volatile short8*localconstvolatileshort8p,\n"
-    "                                           local const volatile ushort8 *localconstvolatileushort8p,\n"
-    "                                           local const volatile int8* localconstvolatileint8p,\n"
-    "                                           local const volatile uint8 * localconstvolatileuint8p,\n"
-    "                                           local const volatile float8* localconstvolatilefloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector8_restrict_p(local const volatile char8 * restrict localconstvolatilechar8restrictp,\n"
-    "                                                    local const volatile uchar8*restrict localconstvolatileuchar8restrictp,\n"
-    "                                                    local const volatile short8 *restrict localconstvolatileshort8restrictp,\n"
-    "                                                    local const volatile ushort8* restrict localconstvolatileushort8restrictp,\n"
-    "                                                    local const volatile int8 * restrict localconstvolatileint8restrictp,\n"
-    "                                                    local const volatile uint8*restrict localconstvolatileuint8restrictp,\n"
-    "                                                    local const volatile float8 * restrict localconstvolatilefloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void vector8_d(char8 char8d,\n"
-    "                      uchar8 uchar8d,\n"
-    "                      short8 short8d,\n"
-    "                      ushort8 ushort8d,\n"
-    "                      int8 int8d,\n"
-    "                      uint8 uint8d,\n"
-    "                      float8 float8d)\n"
-    "{}\n",
-    "\n"
-    "kernel void const_vector8_d(const char8 constchar8d,\n"
-    "                            const uchar8 constuchar8d,\n"
-    "                            const short8 constshort8d,\n"
-    "                            const ushort8 constushort8d,\n"
-    "                            const int8 constint8d,\n"
-    "                            const uint8 constuint8d,\n"
-    "                            const float8 constfloat8d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_vector8_d(private char8 privatechar8d,\n"
-    "                              private uchar8 privateuchar8d,\n"
-    "                              private short8 privateshort8d,\n"
-    "                              private ushort8 privateushort8d,\n"
-    "                              private int8 privateint8d,\n"
-    "                              private uint8 privateuint8d,\n"
-    "                              private float8 privatefloat8d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_const_vector8_d(private const char8 privateconstchar8d,\n"
-    "                                    private const uchar8 privateconstuchar8d,\n"
-    "                                    private const short8 privateconstshort8d,\n"
-    "                                    private const ushort8 privateconstushort8d,\n"
-    "                                    private const int8 privateconstint8d,\n"
-    "                                    private const uint8 privateconstuint8d,\n"
-    "                                    private const float8 privateconstfloat8d)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector16_p0(constant char16*constantchar16p,\n"
-    "                                constant uchar16 *constantuchar16p,\n"
-    "                                constant short16* constantshort16p,\n"
-    "                                constant ushort16 * constantushort16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector16_p1(constant int16*constantint16p,\n"
-    "                                constant uint16 *constantuint16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector16_p2(constant float16*constantfloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector16_restrict_p0(constant char16 *restrict constantchar16restrictp,\n"
-    "                                         constant uchar16* restrict constantuchar16restrictp,\n"
-    "                                         constant short16 * restrict constantshort16restrictp,\n"
-    "                                         constant ushort16*restrict constantushort16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector16_restrict_p1(constant int16 *restrict constantint16restrictp,\n"
-    "                                         constant uint16* restrict constantuint16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector16_restrict_p2(constant float16 *restrict constantfloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector16_p(global char16*globalchar16p,\n"
-    "                              global uchar16 *globaluchar16p,\n"
-    "                              global short16* globalshort16p,\n"
-    "                              global ushort16 * globalushort16p,\n"
-    "                              global int16*globalint16p,\n"
-    "                              global uint16 *globaluint16p,\n"
-    "                              global float16*globalfloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector16_restrict_p(global char16 *restrict globalchar16restrictp,\n"
-    "                                       global uchar16* restrict globaluchar16restrictp,\n"
-    "                                       global short16 * restrict globalshort16restrictp,\n"
-    "                                       global ushort16*restrict globalushort16restrictp,\n"
-    "                                       global int16 *restrict globalint16restrictp,\n"
-    "                                       global uint16* restrict globaluint16restrictp,\n"
-    "                                       global float16 *restrict globalfloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector16_p(global const char16* globalconstchar16p,\n"
-    "                                    global const uchar16 * globalconstuchar16p,\n"
-    "                                    global const short16*globalconstshort16p,\n"
-    "                                    global const ushort16 *globalconstushort16p,\n"
-    "                                    global const int16* globalconstint16p,\n"
-    "                                    global const uint16 * globalconstuint16p,\n"
-    "                                    global const float16* globalconstfloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector16_restrict_p(global const char16 * restrict globalconstchar16restrictp,\n"
-    "                                             global const uchar16*restrict globalconstuchar16restrictp,\n"
-    "                                             global const short16 *restrict globalconstshort16restrictp,\n"
-    "                                             global const ushort16* restrict globalconstushort16restrictp,\n"
-    "                                             global const int16 * restrict globalconstint16restrictp,\n"
-    "                                             global const uint16*restrict globalconstuint16restrictp,\n"
-    "                                             global const float16 * restrict globalconstfloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector16_p(global volatile char16*globalvolatilechar16p,\n"
-    "                                       global volatile uchar16 *globalvolatileuchar16p,\n"
-    "                                       global volatile short16* globalvolatileshort16p,\n"
-    "                                       global volatile ushort16 * globalvolatileushort16p,\n"
-    "                                       global volatile int16*globalvolatileint16p,\n"
-    "                                       global volatile uint16 *globalvolatileuint16p,\n"
-    "                                       global volatile float16*globalvolatilefloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector16_restrict_p(global volatile char16 *restrict globalvolatilechar16restrictp,\n"
-    "                                                global volatile uchar16* restrict globalvolatileuchar16restrictp,\n"
-    "                                                global volatile short16 * restrict globalvolatileshort16restrictp,\n"
-    "                                                global volatile ushort16*restrict globalvolatileushort16restrictp,\n"
-    "                                                global volatile int16 *restrict globalvolatileint16restrictp,\n"
-    "                                                global volatile uint16* restrict globalvolatileuint16restrictp,\n"
-    "                                                global volatile float16 *restrict globalvolatilefloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector16_p(global const volatile char16* globalconstvolatilechar16p,\n"
-    "                                             global const volatile uchar16 * globalconstvolatileuchar16p,\n"
-    "                                             global const volatile short16*globalconstvolatileshort16p,\n"
-    "                                             global const volatile ushort16 *globalconstvolatileushort16p,\n"
-    "                                             global const volatile int16* globalconstvolatileint16p,\n"
-    "                                             global const volatile uint16 * globalconstvolatileuint16p,\n"
-    "                                             global const volatile float16* globalconstvolatilefloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector16_restrict_p(global const volatile char16 * restrict globalconstvolatilechar16restrictp,\n"
-    "                                                      global const volatile uchar16*restrict globalconstvolatileuchar16restrictp,\n"
-    "                                                      global const volatile short16 *restrict globalconstvolatileshort16restrictp,\n"
-    "                                                      global const volatile ushort16* restrict globalconstvolatileushort16restrictp,\n"
-    "                                                      global const volatile int16 * restrict globalconstvolatileint16restrictp,\n"
-    "                                                      global const volatile uint16*restrict globalconstvolatileuint16restrictp,\n"
-    "                                                      global const volatile float16 * restrict globalconstvolatilefloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector16_p(local char16*localchar16p,\n"
-    "                             local uchar16 *localuchar16p,\n"
-    "                             local short16* localshort16p,\n"
-    "                             local ushort16 * localushort16p,\n"
-    "                             local int16*localint16p,\n"
-    "                             local uint16 *localuint16p,\n"
-    "                             local float16*localfloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector16_restrict_p(local char16 *restrict localchar16restrictp,\n"
-    "                                      local uchar16* restrict localuchar16restrictp,\n"
-    "                                      local short16 * restrict localshort16restrictp,\n"
-    "                                      local ushort16*restrict localushort16restrictp,\n"
-    "                                      local int16 *restrict localint16restrictp,\n"
-    "                                      local uint16* restrict localuint16restrictp,\n"
-    "                                      local float16 *restrict localfloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector16_p(local const char16* localconstchar16p,\n"
-    "                                   local const uchar16 * localconstuchar16p,\n"
-    "                                   local const short16*localconstshort16p,\n"
-    "                                   local const ushort16 *localconstushort16p,\n"
-    "                                   local const int16* localconstint16p,\n"
-    "                                   local const uint16 * localconstuint16p,\n"
-    "                                   local const float16* localconstfloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector16_restrict_p(local const char16 * restrict localconstchar16restrictp,\n"
-    "                                            local const uchar16*restrict localconstuchar16restrictp,\n"
-    "                                            local const short16 *restrict localconstshort16restrictp,\n"
-    "                                            local const ushort16* restrict localconstushort16restrictp,\n"
-    "                                            local const int16 * restrict localconstint16restrictp,\n"
-    "                                            local const uint16*restrict localconstuint16restrictp,\n"
-    "                                            local const float16 * restrict localconstfloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector16_p(local volatile char16*localvolatilechar16p,\n"
-    "                                      local volatile uchar16 *localvolatileuchar16p,\n"
-    "                                      local volatile short16* localvolatileshort16p,\n"
-    "                                      local volatile ushort16 * localvolatileushort16p,\n"
-    "                                      local volatile int16*localvolatileint16p,\n"
-    "                                      local volatile uint16 *localvolatileuint16p,\n"
-    "                                      local volatile float16*localvolatilefloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector16_restrict_p(local volatile char16 *restrict localvolatilechar16restrictp,\n"
-    "                                               local volatile uchar16* restrict localvolatileuchar16restrictp,\n"
-    "                                               local volatile short16 * restrict localvolatileshort16restrictp,\n"
-    "                                               local volatile ushort16*restrict localvolatileushort16restrictp,\n"
-    "                                               local volatile int16 *restrict localvolatileint16restrictp,\n"
-    "                                               local volatile uint16* restrict localvolatileuint16restrictp,\n"
-    "                                               local volatile float16 *restrict localvolatilefloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector16_p(local const volatile char16* localconstvolatilechar16p,\n"
-    "                                            local const volatile uchar16 * localconstvolatileuchar16p,\n"
-    "                                            local const volatile short16*localconstvolatileshort16p,\n"
-    "                                            local const volatile ushort16 *localconstvolatileushort16p,\n"
-    "                                            local const volatile int16* localconstvolatileint16p,\n"
-    "                                            local const volatile uint16 * localconstvolatileuint16p,\n"
-    "                                            local const volatile float16* localconstvolatilefloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector16_restrict_p(local const volatile char16 * restrict localconstvolatilechar16restrictp,\n"
-    "                                                     local const volatile uchar16*restrict localconstvolatileuchar16restrictp,\n"
-    "                                                     local const volatile short16 *restrict localconstvolatileshort16restrictp,\n"
-    "                                                     local const volatile ushort16* restrict localconstvolatileushort16restrictp,\n"
-    "                                                     local const volatile int16 * restrict localconstvolatileint16restrictp,\n"
-    "                                                     local const volatile uint16*restrict localconstvolatileuint16restrictp,\n"
-    "                                                     local const volatile float16 * restrict localconstvolatilefloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void vector16_d(char16 char16d,\n"
-    "                       uchar16 uchar16d,\n"
-    "                       short16 short16d,\n"
-    "                       ushort16 ushort16d,\n"
-    "                       int16 int16d,\n"
-    "                       uint16 uint16d,\n"
-    "                       float16 float16d)\n"
-    "{}\n",
-    "\n"
-    "kernel void const_vector16_d(const char16 constchar16d,\n"
-    "                             const uchar16 constuchar16d,\n"
-    "                             const short16 constshort16d,\n"
-    "                             const ushort16 constushort16d,\n"
-    "                             const int16 constint16d,\n"
-    "                             const uint16 constuint16d,\n"
-    "                             const float16 constfloat16d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_vector16_d(private char16 privatechar16d,\n"
-    "                               private uchar16 privateuchar16d,\n"
-    "                               private short16 privateshort16d,\n"
-    "                               private ushort16 privateushort16d,\n"
-    "                               private int16 privateint16d,\n"
-    "                               private uint16 privateuint16d,\n"
-    "                               private float16 privatefloat16d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_const_vector16_d(private const char16 privateconstchar16d,\n"
-    "                                     private const uchar16 privateconstuchar16d,\n"
-    "                                     private const short16 privateconstshort16d,\n"
-    "                                     private const ushort16 privateconstushort16d,\n"
-    "                                     private const int16 privateconstint16d,\n"
-    "                                     private const uint16 privateconstuint16d,\n"
-    "                                     private const float16 privateconstfloat16d)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_derived_p0(constant typedef_type*constanttypedef_typep,\n"
-    "                               constant struct struct_type *constantstructstruct_typep,\n"
-    "                               constant typedef_struct_type* constanttypedef_struct_typep,\n"
-    "                               constant union union_type * constantunionunion_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_derived_p1(constant typedef_union_type*constanttypedef_union_typep,\n"
-    "                               constant enum enum_type *constantenumenum_typep,\n"
-    "                               constant typedef_enum_type* constanttypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_derived_restrict_p0(constant typedef_type * restrict constanttypedef_typerestrictp,\n"
-    "                                        constant struct struct_type*restrict constantstructstruct_typerestrictp,\n"
-    "                                        constant typedef_struct_type *restrict constanttypedef_struct_typerestrictp,\n"
-    "                                        constant union union_type* restrict constantunionunion_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_derived_restrict_p1(constant typedef_union_type * restrict constanttypedef_union_typerestrictp,\n"
-    "                                        constant enum enum_type*restrict constantenumenum_typerestrictp,\n"
-    "                                        constant typedef_enum_type *restrict constanttypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_derived_p(global typedef_type*globaltypedef_typep,\n"
-    "                             global struct struct_type *globalstructstruct_typep,\n"
-    "                             global typedef_struct_type* globaltypedef_struct_typep,\n"
-    "                             global union union_type * globalunionunion_typep,\n"
-    "                             global typedef_union_type*globaltypedef_union_typep,\n"
-    "                             global enum enum_type *globalenumenum_typep,\n"
-    "                             global typedef_enum_type* globaltypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_derived_restrict_p(global typedef_type * restrict globaltypedef_typerestrictp,\n"
-    "                                      global struct struct_type*restrict globalstructstruct_typerestrictp,\n"
-    "                                      global typedef_struct_type *restrict globaltypedef_struct_typerestrictp,\n"
-    "                                      global union union_type* restrict globalunionunion_typerestrictp,\n"
-    "                                      global typedef_union_type * restrict globaltypedef_union_typerestrictp,\n"
-    "                                      global enum enum_type*restrict globalenumenum_typerestrictp,\n"
-    "                                      global typedef_enum_type *restrict globaltypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_derived_p(global const typedef_type* globalconsttypedef_typep,\n"
-    "                                   global const struct struct_type * globalconststructstruct_typep,\n"
-    "                                   global const typedef_struct_type*globalconsttypedef_struct_typep,\n"
-    "                                   global const union union_type *globalconstunionunion_typep,\n"
-    "                                   global const typedef_union_type* globalconsttypedef_union_typep,\n"
-    "                                   global const enum enum_type * globalconstenumenum_typep,\n"
-    "                                   global const typedef_enum_type*globalconsttypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_derived_restrict_p(global const typedef_type *restrict globalconsttypedef_typerestrictp,\n"
-    "                                            global const struct struct_type* restrict globalconststructstruct_typerestrictp,\n"
-    "                                            global const typedef_struct_type * restrict globalconsttypedef_struct_typerestrictp,\n"
-    "                                            global const union union_type*restrict globalconstunionunion_typerestrictp,\n"
-    "                                            global const typedef_union_type *restrict globalconsttypedef_union_typerestrictp,\n"
-    "                                            global const enum enum_type* restrict globalconstenumenum_typerestrictp,\n"
-    "                                            global const typedef_enum_type * restrict globalconsttypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_derived_p(global volatile typedef_type*globalvolatiletypedef_typep,\n"
-    "                                      global volatile struct struct_type *globalvolatilestructstruct_typep,\n"
-    "                                      global volatile typedef_struct_type* globalvolatiletypedef_struct_typep,\n"
-    "                                      global volatile union union_type * globalvolatileunionunion_typep,\n"
-    "                                      global volatile typedef_union_type*globalvolatiletypedef_union_typep,\n"
-    "                                      global volatile enum enum_type *globalvolatileenumenum_typep,\n"
-    "                                      global volatile typedef_enum_type* globalvolatiletypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_derived_restrict_p(global volatile typedef_type * restrict globalvolatiletypedef_typerestrictp,\n"
-    "                                               global volatile struct struct_type*restrict globalvolatilestructstruct_typerestrictp,\n"
-    "                                               global volatile typedef_struct_type *restrict globalvolatiletypedef_struct_typerestrictp,\n"
-    "                                               global volatile union union_type* restrict globalvolatileunionunion_typerestrictp,\n"
-    "                                               global volatile typedef_union_type * restrict globalvolatiletypedef_union_typerestrictp,\n"
-    "                                               global volatile enum enum_type*restrict globalvolatileenumenum_typerestrictp,\n"
-    "                                               global volatile typedef_enum_type *restrict globalvolatiletypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_derived_p(global const volatile typedef_type* globalconstvolatiletypedef_typep,\n"
-    "                                            global const volatile struct struct_type * globalconstvolatilestructstruct_typep,\n"
-    "                                            global const volatile typedef_struct_type*globalconstvolatiletypedef_struct_typep,\n"
-    "                                            global const volatile union union_type *globalconstvolatileunionunion_typep,\n"
-    "                                            global const volatile typedef_union_type* globalconstvolatiletypedef_union_typep,\n"
-    "                                            global const volatile enum enum_type * globalconstvolatileenumenum_typep,\n"
-    "                                            global const volatile typedef_enum_type*globalconstvolatiletypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_derived_restrict_p(global const volatile typedef_type *restrict globalconstvolatiletypedef_typerestrictp,\n"
-    "                                                     global const volatile struct struct_type* restrict globalconstvolatilestructstruct_typerestrictp,\n"
-    "                                                     global const volatile typedef_struct_type * restrict globalconstvolatiletypedef_struct_typerestrictp,\n"
-    "                                                     global const volatile union union_type*restrict globalconstvolatileunionunion_typerestrictp,\n"
-    "                                                     global const volatile typedef_union_type *restrict globalconstvolatiletypedef_union_typerestrictp,\n"
-    "                                                     global const volatile enum enum_type* restrict globalconstvolatileenumenum_typerestrictp,\n"
-    "                                                     global const volatile typedef_enum_type * restrict globalconstvolatiletypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_derived_p(local typedef_type*localtypedef_typep,\n"
-    "                            local struct struct_type *localstructstruct_typep,\n"
-    "                            local typedef_struct_type* localtypedef_struct_typep,\n"
-    "                            local union union_type * localunionunion_typep,\n"
-    "                            local typedef_union_type*localtypedef_union_typep,\n"
-    "                            local enum enum_type *localenumenum_typep,\n"
-    "                            local typedef_enum_type* localtypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_derived_restrict_p(local typedef_type * restrict localtypedef_typerestrictp,\n"
-    "                                     local struct struct_type*restrict localstructstruct_typerestrictp,\n"
-    "                                     local typedef_struct_type *restrict localtypedef_struct_typerestrictp,\n"
-    "                                     local union union_type* restrict localunionunion_typerestrictp,\n"
-    "                                     local typedef_union_type * restrict localtypedef_union_typerestrictp,\n"
-    "                                     local enum enum_type*restrict localenumenum_typerestrictp,\n"
-    "                                     local typedef_enum_type *restrict localtypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_derived_p(local const typedef_type* localconsttypedef_typep,\n"
-    "                                  local const struct struct_type * localconststructstruct_typep,\n"
-    "                                  local const typedef_struct_type*localconsttypedef_struct_typep,\n"
-    "                                  local const union union_type *localconstunionunion_typep,\n"
-    "                                  local const typedef_union_type* localconsttypedef_union_typep,\n"
-    "                                  local const enum enum_type * localconstenumenum_typep,\n"
-    "                                  local const typedef_enum_type*localconsttypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_derived_restrict_p(local const typedef_type *restrict localconsttypedef_typerestrictp,\n"
-    "                                           local const struct struct_type* restrict localconststructstruct_typerestrictp,\n"
-    "                                           local const typedef_struct_type * restrict localconsttypedef_struct_typerestrictp,\n"
-    "                                           local const union union_type*restrict localconstunionunion_typerestrictp,\n"
-    "                                           local const typedef_union_type *restrict localconsttypedef_union_typerestrictp,\n"
-    "                                           local const enum enum_type* restrict localconstenumenum_typerestrictp,\n"
-    "                                           local const typedef_enum_type * restrict localconsttypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_derived_p(local volatile typedef_type*localvolatiletypedef_typep,\n"
-    "                                     local volatile struct struct_type *localvolatilestructstruct_typep,\n"
-    "                                     local volatile typedef_struct_type* localvolatiletypedef_struct_typep,\n"
-    "                                     local volatile union union_type * localvolatileunionunion_typep,\n"
-    "                                     local volatile typedef_union_type*localvolatiletypedef_union_typep,\n"
-    "                                     local volatile enum enum_type *localvolatileenumenum_typep,\n"
-    "                                     local volatile typedef_enum_type* localvolatiletypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_derived_restrict_p(local volatile typedef_type * restrict localvolatiletypedef_typerestrictp,\n"
-    "                                              local volatile struct struct_type*restrict localvolatilestructstruct_typerestrictp,\n"
-    "                                              local volatile typedef_struct_type *restrict localvolatiletypedef_struct_typerestrictp,\n"
-    "                                              local volatile union union_type* restrict localvolatileunionunion_typerestrictp,\n"
-    "                                              local volatile typedef_union_type * restrict localvolatiletypedef_union_typerestrictp,\n"
-    "                                              local volatile enum enum_type*restrict localvolatileenumenum_typerestrictp,\n"
-    "                                              local volatile typedef_enum_type *restrict localvolatiletypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_derived_p(local const volatile typedef_type* localconstvolatiletypedef_typep,\n"
-    "                                           local const volatile struct struct_type * localconstvolatilestructstruct_typep,\n"
-    "                                           local const volatile typedef_struct_type*localconstvolatiletypedef_struct_typep,\n"
-    "                                           local const volatile union union_type *localconstvolatileunionunion_typep,\n"
-    "                                           local const volatile typedef_union_type* localconstvolatiletypedef_union_typep,\n"
-    "                                           local const volatile enum enum_type * localconstvolatileenumenum_typep,\n"
-    "                                           local const volatile typedef_enum_type*localconstvolatiletypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_derived_restrict_p(local const volatile typedef_type *restrict localconstvolatiletypedef_typerestrictp,\n"
-    "                                                    local const volatile struct struct_type* restrict localconstvolatilestructstruct_typerestrictp,\n"
-    "                                                    local const volatile typedef_struct_type * restrict localconstvolatiletypedef_struct_typerestrictp,\n"
-    "                                                    local const volatile union union_type*restrict localconstvolatileunionunion_typerestrictp,\n"
-    "                                                    local const volatile typedef_union_type *restrict localconstvolatiletypedef_union_typerestrictp,\n"
-    "                                                    local const volatile enum enum_type* restrict localconstvolatileenumenum_typerestrictp,\n"
-    "                                                    local const volatile typedef_enum_type * restrict localconstvolatiletypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void derived_d(typedef_type typedef_typed,\n"
-    "                      struct struct_type structstruct_typed,\n"
-    "                      typedef_struct_type typedef_struct_typed,\n"
-    "                      union union_type unionunion_typed,\n"
-    "                      typedef_union_type typedef_union_typed,\n"
-    "                      enum enum_type enumenum_typed,\n"
-    "                      typedef_enum_type typedef_enum_typed)\n"
-    "{}\n",
-    "\n"
-    "kernel void const_derived_d(const typedef_type consttypedef_typed,\n"
-    "                            const struct struct_type conststructstruct_typed,\n"
-    "                            const typedef_struct_type consttypedef_struct_typed,\n"
-    "                            const union union_type constunionunion_typed,\n"
-    "                            const typedef_union_type consttypedef_union_typed,\n"
-    "                            const enum enum_type constenumenum_typed,\n"
-    "                            const typedef_enum_type consttypedef_enum_typed)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_derived_d(private typedef_type privatetypedef_typed,\n"
-    "                              private struct struct_type privatestructstruct_typed,\n"
-    "                              private typedef_struct_type privatetypedef_struct_typed,\n"
-    "                              private union union_type privateunionunion_typed,\n"
-    "                              private typedef_union_type privatetypedef_union_typed,\n"
-    "                              private enum enum_type privateenumenum_typed,\n"
-    "                              private typedef_enum_type privatetypedef_enum_typed)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_const_derived_d(private const typedef_type privateconsttypedef_typed,\n"
-    "                                    private const struct struct_type privateconststructstruct_typed,\n"
-    "                                    private const typedef_struct_type privateconsttypedef_struct_typed,\n"
-    "                                    private const union union_type privateconstunionunion_typed,\n"
-    "                                    private const typedef_union_type privateconsttypedef_union_typed,\n"
-    "                                    private const enum enum_type privateconstenumenum_typed,\n"
-    "                                    private const typedef_enum_type privateconsttypedef_enum_typed)\n"
-    "{}\n",
-    "\n"
-};
-
-const char * required_arg_info[][72] = {
-    // The minimum value of CL_DEVICE_MAX_CONSTANT_ARGS is 4
-    {
-        "constant_scalar_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "void*", "constantvoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char*", "constantcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar*", "constantucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar*", "constantunsignedcharp",
-        NULL
-  },
-  {
-        "constant_scalar_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short*", "constantshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort*", "constantushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort*", "constantunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int*", "constantintp",
-        NULL
-  },
-  {
-        "constant_scalar_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint*", "constantuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint*", "constantunsignedintp",
-        NULL
-  },
-  {
-        "constant_scalar_p3",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float*", "constantfloatp",
-        NULL
-    },
-    {
-        "constant_scalar_restrict_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "constantvoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "constantcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "constantucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "constantunsignedcharrestrictp",
-        NULL
-  },
-  {
-        "constant_scalar_restrict_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "constantshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "constantushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "constantunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "constantintrestrictp",
-        NULL
-  },
-  {
-        "constant_scalar_restrict_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "constantuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "constantunsignedintrestrictp",
-        NULL
-  },
-  {
-        "constant_scalar_restrict_p3",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "constantfloatrestrictp",
-        NULL
-    },
-    {
-        "global_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "void*", "globalvoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char*", "globalcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar*", "globalucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar*", "globalunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short*", "globalshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort*", "globalushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort*", "globalunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int*", "globalintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint*", "globaluintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint*", "globalunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float*", "globalfloatp",
-        NULL
-    },
-    {
-        "global_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "globalvoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "globalcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "globalshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "globalintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globaluintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globalunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "globalfloatrestrictp",
-        NULL
-    },
-    {
-        "global_const_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "void*", "globalconstvoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char*", "globalconstcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar*", "globalconstucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar*", "globalconstunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short*", "globalconstshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort*", "globalconstushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort*", "globalconstunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int*", "globalconstintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint*", "globalconstuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint*", "globalconstunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float*", "globalconstfloatp",
-        NULL
-    },
-    {
-        "global_const_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "globalconstvoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "globalconstcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalconstucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalconstunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "globalconstshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalconstushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalconstunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "globalconstintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globalconstuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globalconstunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "globalconstfloatrestrictp",
-        NULL
-    },
-    {
-        "global_volatile_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "void*", "globalvolatilevoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char*", "globalvolatilecharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "globalvolatileucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "globalvolatileunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short*", "globalvolatileshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "globalvolatileushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "globalvolatileunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int*", "globalvolatileintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "globalvolatileuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "globalvolatileunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float*", "globalvolatilefloatp",
-        NULL
-    },
-    {
-        "global_volatile_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "globalvolatilevoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "globalvolatilecharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalvolatileucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalvolatileunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "globalvolatileshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalvolatileushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalvolatileunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "globalvolatileintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globalvolatileuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globalvolatileunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "globalvolatilefloatrestrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "void*", "globalconstvolatilevoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char*", "globalconstvolatilecharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "globalconstvolatileucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "globalconstvolatileunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short*", "globalconstvolatileshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "globalconstvolatileushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "globalconstvolatileunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int*", "globalconstvolatileintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "globalconstvolatileuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "globalconstvolatileunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float*", "globalconstvolatilefloatp",
-        NULL
-    },
-    {
-        "global_const_volatile_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "globalconstvolatilevoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "globalconstvolatilecharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalconstvolatileucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalconstvolatileunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "globalconstvolatileshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalconstvolatileushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalconstvolatileunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "globalconstvolatileintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globalconstvolatileuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globalconstvolatileunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "globalconstvolatilefloatrestrictp",
-        NULL
-    },
-    {
-        "local_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "void*", "localvoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char*", "localcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar*", "localucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar*", "localunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short*", "localshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort*", "localushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort*", "localunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int*", "localintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint*", "localuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint*", "localunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float*", "localfloatp",
-        NULL
-    },
-    {
-        "local_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "localvoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "localcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "localshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "localintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "localfloatrestrictp",
-        NULL
-    },
-    {
-        "local_const_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "void*", "localconstvoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char*", "localconstcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar*", "localconstucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar*", "localconstunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short*", "localconstshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort*", "localconstushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort*", "localconstunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int*", "localconstintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint*", "localconstuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint*", "localconstunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float*", "localconstfloatp",
-        NULL
-    },
-    {
-        "local_const_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "localconstvoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "localconstcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localconstucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localconstunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "localconstshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localconstushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localconstunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "localconstintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localconstuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localconstunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "localconstfloatrestrictp",
-        NULL
-    },
-    {
-        "local_volatile_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "void*", "localvolatilevoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char*", "localvolatilecharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "localvolatileucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "localvolatileunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short*", "localvolatileshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "localvolatileushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "localvolatileunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int*", "localvolatileintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "localvolatileuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "localvolatileunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float*", "localvolatilefloatp",
-        NULL
-    },
-    {
-        "local_volatile_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "localvolatilevoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "localvolatilecharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localvolatileucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localvolatileunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "localvolatileshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localvolatileushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localvolatileunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "localvolatileintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localvolatileuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localvolatileunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "localvolatilefloatrestrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "void*", "localconstvolatilevoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char*", "localconstvolatilecharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "localconstvolatileucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "localconstvolatileunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short*", "localconstvolatileshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "localconstvolatileushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "localconstvolatileunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int*", "localconstvolatileintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "localconstvolatileuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "localconstvolatileunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float*", "localconstvolatilefloatp",
-        NULL
-    },
-    {
-        "local_const_volatile_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "localconstvolatilevoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "localconstvolatilecharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localconstvolatileucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localconstvolatileunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "localconstvolatileshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localconstvolatileushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localconstvolatileunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "localconstvolatileintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localconstvolatileuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localconstvolatileunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "localconstvolatilefloatrestrictp",
-        NULL
-    },
-    {
-        "scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char", "chard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "uchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "unsignedchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short", "shortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "ushortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "unsignedshortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int", "intd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "uintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "unsignedintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float", "floatd",
-        NULL
-    },
-    {
-        "const_scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char", "constchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "constuchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "constunsignedchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short", "constshortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "constushortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "constunsignedshortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int", "constintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "constuintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "constunsignedintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float", "constfloatd",
-        NULL
-    },
-    {
-        "private_scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char", "privatechard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "privateuchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "privateunsignedchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short", "privateshortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "privateushortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "privateunsignedshortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int", "privateintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "privateuintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "privateunsignedintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float", "privatefloatd",
-        NULL
-    },
-    {
-        "private_const_scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char", "privateconstchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "privateconstuchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "privateconstunsignedchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short", "privateconstshortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "privateconstushortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "privateconstunsignedshortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int", "privateconstintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "privateconstuintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "privateconstunsignedintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float", "privateconstfloatd",
-        NULL
-    },
-    {
-        "constant_vector2_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char2*", "constantchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar2*", "constantuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short2*", "constantshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort2*", "constantushort2p",
-        NULL
-    },
-    {
-        "constant_vector2_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int2*", "constantint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint2*", "constantuint2p",
-        NULL
-    },
-    {
-        "constant_vector2_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float2*", "constantfloat2p",
-        NULL
-    },
-    {
-        "constant_vector2_restrict_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "constantchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "constantuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "constantshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "constantushort2restrictp",
-        NULL
-    },
-    {
-        "constant_vector2_restrict_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "constantint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "constantuint2restrictp",
-        NULL
-    },
-    {
-        "constant_vector2_restrict_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "constantfloat2restrictp",
-        NULL
-    },
-    {
-        "global_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char2*", "globalchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar2*", "globaluchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short2*", "globalshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort2*", "globalushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int2*", "globalint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint2*", "globaluint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float2*", "globalfloat2p",
-        NULL
-    },
-    {
-        "global_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "globalchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "globaluchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "globalshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "globalushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "globalint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "globaluint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "globalfloat2restrictp",
-        NULL
-    },
-    {
-        "global_const_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char2*", "globalconstchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar2*", "globalconstuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short2*", "globalconstshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort2*", "globalconstushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int2*", "globalconstint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint2*", "globalconstuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float2*", "globalconstfloat2p",
-        NULL
-    },
-    {
-        "global_const_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "globalconstchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "globalconstuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "globalconstshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "globalconstushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "globalconstint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "globalconstuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "globalconstfloat2restrictp",
-        NULL
-    },
-    {
-        "global_volatile_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char2*", "globalvolatilechar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar2*", "globalvolatileuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short2*", "globalvolatileshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort2*", "globalvolatileushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int2*", "globalvolatileint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint2*", "globalvolatileuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float2*", "globalvolatilefloat2p",
-        NULL
-    },
-    {
-        "global_volatile_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "globalvolatilechar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "globalvolatileuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "globalvolatileshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "globalvolatileushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "globalvolatileint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "globalvolatileuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "globalvolatilefloat2restrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char2*", "globalconstvolatilechar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar2*", "globalconstvolatileuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short2*", "globalconstvolatileshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort2*", "globalconstvolatileushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int2*", "globalconstvolatileint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint2*", "globalconstvolatileuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float2*", "globalconstvolatilefloat2p",
-        NULL
-    },
-    {
-        "global_const_volatile_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "globalconstvolatilechar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "globalconstvolatileuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "globalconstvolatileshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "globalconstvolatileushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "globalconstvolatileint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "globalconstvolatileuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "globalconstvolatilefloat2restrictp",
-        NULL
-    },
-    {
-        "local_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char2*", "localchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar2*", "localuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short2*", "localshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort2*", "localushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int2*", "localint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint2*", "localuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float2*", "localfloat2p",
-        NULL
-    },
-    {
-        "local_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "localchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "localuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "localshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "localushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "localint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "localuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "localfloat2restrictp",
-        NULL
-    },
-    {
-        "local_const_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char2*", "localconstchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar2*", "localconstuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short2*", "localconstshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort2*", "localconstushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int2*", "localconstint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint2*", "localconstuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float2*", "localconstfloat2p",
-        NULL
-    },
-    {
-        "local_const_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "localconstchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "localconstuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "localconstshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "localconstushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "localconstint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "localconstuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "localconstfloat2restrictp",
-        NULL
-    },
-    {
-        "local_volatile_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char2*", "localvolatilechar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar2*", "localvolatileuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short2*", "localvolatileshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort2*", "localvolatileushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int2*", "localvolatileint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint2*", "localvolatileuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float2*", "localvolatilefloat2p",
-        NULL
-    },
-    {
-        "local_volatile_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "localvolatilechar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "localvolatileuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "localvolatileshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "localvolatileushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "localvolatileint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "localvolatileuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "localvolatilefloat2restrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char2*", "localconstvolatilechar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar2*", "localconstvolatileuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short2*", "localconstvolatileshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort2*", "localconstvolatileushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int2*", "localconstvolatileint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint2*", "localconstvolatileuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float2*", "localconstvolatilefloat2p",
-        NULL
-    },
-    {
-        "local_const_volatile_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "localconstvolatilechar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "localconstvolatileuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "localconstvolatileshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "localconstvolatileushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "localconstvolatileint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "localconstvolatileuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "localconstvolatilefloat2restrictp",
-        NULL
-    },
-    {
-        "vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char2", "char2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar2", "uchar2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short2", "short2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort2", "ushort2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int2", "int2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint2", "uint2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float2", "float2d",
-        NULL
-    },
-    {
-        "const_vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char2", "constchar2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar2", "constuchar2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short2", "constshort2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort2", "constushort2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int2", "constint2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint2", "constuint2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float2", "constfloat2d",
-        NULL
-    },
-    {
-        "private_vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char2", "privatechar2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar2", "privateuchar2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short2", "privateshort2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort2", "privateushort2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int2", "privateint2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint2", "privateuint2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float2", "privatefloat2d",
-        NULL
-    },
-    {
-        "private_const_vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char2", "privateconstchar2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar2", "privateconstuchar2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short2", "privateconstshort2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort2", "privateconstushort2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int2", "privateconstint2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint2", "privateconstuint2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float2", "privateconstfloat2d",
-        NULL
-    },
-    {
-        "constant_vector3_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char3*", "constantchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar3*", "constantuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short3*", "constantshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort3*", "constantushort3p",
-        NULL
-    },
-    {
-        "constant_vector3_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int3*", "constantint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint3*", "constantuint3p",
-        NULL
-    },
-    {
-        "constant_vector3_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float3*", "constantfloat3p",
-        NULL
-    },
-    {
-        "constant_vector3_restrict_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "constantchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "constantuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "constantshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "constantushort3restrictp",
-        NULL
-    },
-    {
-        "constant_vector3_restrict_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "constantint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "constantuint3restrictp",
-        NULL
-    },
-    {
-        "constant_vector3_restrict_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "constantfloat3restrictp",
-        NULL
-    },
-    {
-        "global_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char3*", "globalchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar3*", "globaluchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short3*", "globalshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort3*", "globalushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int3*", "globalint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint3*", "globaluint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float3*", "globalfloat3p",
-        NULL
-    },
-    {
-        "global_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "globalchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "globaluchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "globalshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "globalushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "globalint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "globaluint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "globalfloat3restrictp",
-        NULL
-    },
-    {
-        "global_const_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char3*", "globalconstchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar3*", "globalconstuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short3*", "globalconstshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort3*", "globalconstushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int3*", "globalconstint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint3*", "globalconstuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float3*", "globalconstfloat3p",
-        NULL
-    },
-    {
-        "global_const_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "globalconstchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "globalconstuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "globalconstshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "globalconstushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "globalconstint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "globalconstuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "globalconstfloat3restrictp",
-        NULL
-    },
-    {
-        "global_volatile_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char3*", "globalvolatilechar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar3*", "globalvolatileuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short3*", "globalvolatileshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort3*", "globalvolatileushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int3*", "globalvolatileint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint3*", "globalvolatileuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float3*", "globalvolatilefloat3p",
-        NULL
-    },
-    {
-        "global_volatile_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "globalvolatilechar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "globalvolatileuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "globalvolatileshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "globalvolatileushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "globalvolatileint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "globalvolatileuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "globalvolatilefloat3restrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char3*", "globalconstvolatilechar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar3*", "globalconstvolatileuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short3*", "globalconstvolatileshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort3*", "globalconstvolatileushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int3*", "globalconstvolatileint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint3*", "globalconstvolatileuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float3*", "globalconstvolatilefloat3p",
-        NULL
-    },
-    {
-        "global_const_volatile_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "globalconstvolatilechar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "globalconstvolatileuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "globalconstvolatileshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "globalconstvolatileushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "globalconstvolatileint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "globalconstvolatileuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "globalconstvolatilefloat3restrictp",
-        NULL
-    },
-    {
-        "local_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char3*", "localchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar3*", "localuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short3*", "localshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort3*", "localushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int3*", "localint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint3*", "localuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float3*", "localfloat3p",
-        NULL
-    },
-    {
-        "local_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "localchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "localuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "localshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "localushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "localint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "localuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "localfloat3restrictp",
-        NULL
-    },
-    {
-        "local_const_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char3*", "localconstchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar3*", "localconstuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short3*", "localconstshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort3*", "localconstushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int3*", "localconstint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint3*", "localconstuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float3*", "localconstfloat3p",
-        NULL
-    },
-    {
-        "local_const_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "localconstchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "localconstuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "localconstshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "localconstushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "localconstint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "localconstuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "localconstfloat3restrictp",
-        NULL
-    },
-    {
-        "local_volatile_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char3*", "localvolatilechar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar3*", "localvolatileuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short3*", "localvolatileshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort3*", "localvolatileushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int3*", "localvolatileint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint3*", "localvolatileuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float3*", "localvolatilefloat3p",
-        NULL
-    },
-    {
-        "local_volatile_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "localvolatilechar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "localvolatileuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "localvolatileshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "localvolatileushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "localvolatileint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "localvolatileuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "localvolatilefloat3restrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char3*", "localconstvolatilechar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar3*", "localconstvolatileuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short3*", "localconstvolatileshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort3*", "localconstvolatileushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int3*", "localconstvolatileint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint3*", "localconstvolatileuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float3*", "localconstvolatilefloat3p",
-        NULL
-    },
-    {
-        "local_const_volatile_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "localconstvolatilechar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "localconstvolatileuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "localconstvolatileshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "localconstvolatileushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "localconstvolatileint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "localconstvolatileuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "localconstvolatilefloat3restrictp",
-        NULL
-    },
-    {
-        "vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char3", "char3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar3", "uchar3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short3", "short3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort3", "ushort3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int3", "int3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint3", "uint3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float3", "float3d",
-        NULL
-    },
-    {
-        "const_vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char3", "constchar3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar3", "constuchar3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short3", "constshort3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort3", "constushort3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int3", "constint3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint3", "constuint3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float3", "constfloat3d",
-        NULL
-    },
-    {
-        "private_vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char3", "privatechar3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar3", "privateuchar3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short3", "privateshort3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort3", "privateushort3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int3", "privateint3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint3", "privateuint3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float3", "privatefloat3d",
-        NULL
-    },
-    {
-        "private_const_vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char3", "privateconstchar3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar3", "privateconstuchar3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short3", "privateconstshort3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort3", "privateconstushort3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int3", "privateconstint3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint3", "privateconstuint3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float3", "privateconstfloat3d",
-        NULL
-    },
-    {
-        "constant_vector4_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char4*", "constantchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar4*", "constantuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short4*", "constantshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort4*", "constantushort4p",
-        NULL
-    },
-    {
-        "constant_vector4_p1",
-    (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int4*", "constantint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint4*", "constantuint4p",
-        NULL
-    },
-    {
-        "constant_vector4_p2",
-    (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float4*", "constantfloat4p",
-        NULL
-    },
-    {
-        "constant_vector4_restrict_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "constantchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "constantuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "constantshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "constantushort4restrictp",
-        NULL
-    },
-    {
-        "constant_vector4_restrict_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "constantint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "constantuint4restrictp",
-        NULL
-    },
-    {
-        "constant_vector4_restrict_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "constantfloat4restrictp",
-        NULL
-    },
-    {
-        "global_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char4*", "globalchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar4*", "globaluchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short4*", "globalshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort4*", "globalushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int4*", "globalint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint4*", "globaluint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float4*", "globalfloat4p",
-        NULL
-    },
-    {
-        "global_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "globalchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "globaluchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "globalshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "globalushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "globalint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "globaluint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "globalfloat4restrictp",
-        NULL
-    },
-    {
-        "global_const_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char4*", "globalconstchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar4*", "globalconstuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short4*", "globalconstshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort4*", "globalconstushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int4*", "globalconstint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint4*", "globalconstuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float4*", "globalconstfloat4p",
-        NULL
-    },
-    {
-        "global_const_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "globalconstchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "globalconstuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "globalconstshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "globalconstushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "globalconstint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "globalconstuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "globalconstfloat4restrictp",
-        NULL
-    },
-    {
-        "global_volatile_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char4*", "globalvolatilechar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar4*", "globalvolatileuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short4*", "globalvolatileshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort4*", "globalvolatileushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int4*", "globalvolatileint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint4*", "globalvolatileuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float4*", "globalvolatilefloat4p",
-        NULL
-    },
-    {
-        "global_volatile_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "globalvolatilechar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "globalvolatileuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "globalvolatileshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "globalvolatileushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "globalvolatileint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "globalvolatileuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "globalvolatilefloat4restrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char4*", "globalconstvolatilechar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar4*", "globalconstvolatileuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short4*", "globalconstvolatileshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort4*", "globalconstvolatileushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int4*", "globalconstvolatileint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint4*", "globalconstvolatileuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float4*", "globalconstvolatilefloat4p",
-        NULL
-    },
-    {
-        "global_const_volatile_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "globalconstvolatilechar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "globalconstvolatileuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "globalconstvolatileshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "globalconstvolatileushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "globalconstvolatileint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "globalconstvolatileuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "globalconstvolatilefloat4restrictp",
-        NULL
-    },
-    {
-        "local_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char4*", "localchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar4*", "localuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short4*", "localshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort4*", "localushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int4*", "localint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint4*", "localuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float4*", "localfloat4p",
-        NULL
-    },
-    {
-        "local_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "localchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "localuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "localshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "localushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "localint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "localuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "localfloat4restrictp",
-        NULL
-    },
-    {
-        "local_const_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char4*", "localconstchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar4*", "localconstuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short4*", "localconstshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort4*", "localconstushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int4*", "localconstint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint4*", "localconstuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float4*", "localconstfloat4p",
-        NULL
-    },
-    {
-        "local_const_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "localconstchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "localconstuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "localconstshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "localconstushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "localconstint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "localconstuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "localconstfloat4restrictp",
-        NULL
-    },
-    {
-        "local_volatile_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char4*", "localvolatilechar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar4*", "localvolatileuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short4*", "localvolatileshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort4*", "localvolatileushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int4*", "localvolatileint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint4*", "localvolatileuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float4*", "localvolatilefloat4p",
-        NULL
-    },
-    {
-        "local_volatile_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "localvolatilechar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "localvolatileuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "localvolatileshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "localvolatileushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "localvolatileint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "localvolatileuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "localvolatilefloat4restrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char4*", "localconstvolatilechar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar4*", "localconstvolatileuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short4*", "localconstvolatileshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort4*", "localconstvolatileushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int4*", "localconstvolatileint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint4*", "localconstvolatileuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float4*", "localconstvolatilefloat4p",
-        NULL
-    },
-    {
-        "local_const_volatile_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "localconstvolatilechar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "localconstvolatileuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "localconstvolatileshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "localconstvolatileushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "localconstvolatileint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "localconstvolatileuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "localconstvolatilefloat4restrictp",
-        NULL
-    },
-    {
-        "vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char4", "char4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar4", "uchar4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short4", "short4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort4", "ushort4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int4", "int4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint4", "uint4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float4", "float4d",
-        NULL
-    },
-    {
-        "const_vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char4", "constchar4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar4", "constuchar4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short4", "constshort4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort4", "constushort4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int4", "constint4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint4", "constuint4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float4", "constfloat4d",
-        NULL
-    },
-    {
-        "private_vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char4", "privatechar4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar4", "privateuchar4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short4", "privateshort4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort4", "privateushort4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int4", "privateint4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint4", "privateuint4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float4", "privatefloat4d",
-        NULL
-    },
-    {
-        "private_const_vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char4", "privateconstchar4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar4", "privateconstuchar4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short4", "privateconstshort4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort4", "privateconstushort4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int4", "privateconstint4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint4", "privateconstuint4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float4", "privateconstfloat4d",
-        NULL
-    },
-    {
-        "constant_vector8_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char8*", "constantchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar8*", "constantuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short8*", "constantshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort8*", "constantushort8p",
-        NULL
-    },
-    {
-        "constant_vector8_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int8*", "constantint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint8*", "constantuint8p",
-    NULL
-    },
-    {
-        "constant_vector8_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float8*", "constantfloat8p",
-        NULL
-    },
-    {
-        "constant_vector8_restrict_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "constantchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "constantuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "constantshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "constantushort8restrictp",
-        NULL
-    },
-    {
-        "constant_vector8_restrict_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "constantint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "constantuint8restrictp",
-    NULL
-    },
-    {
-        "constant_vector8_restrict_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "constantfloat8restrictp",
-        NULL
-    },
-    {
-        "global_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char8*", "globalchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar8*", "globaluchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short8*", "globalshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort8*", "globalushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int8*", "globalint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint8*", "globaluint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float8*", "globalfloat8p",
-        NULL
-    },
-    {
-        "global_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "globalchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "globaluchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "globalshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "globalushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "globalint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "globaluint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "globalfloat8restrictp",
-        NULL
-    },
-    {
-        "global_const_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char8*", "globalconstchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar8*", "globalconstuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short8*", "globalconstshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort8*", "globalconstushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int8*", "globalconstint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint8*", "globalconstuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float8*", "globalconstfloat8p",
-        NULL
-    },
-    {
-        "global_const_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "globalconstchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "globalconstuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "globalconstshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "globalconstushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "globalconstint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "globalconstuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "globalconstfloat8restrictp",
-        NULL
-    },
-    {
-        "global_volatile_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char8*", "globalvolatilechar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar8*", "globalvolatileuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short8*", "globalvolatileshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort8*", "globalvolatileushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int8*", "globalvolatileint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint8*", "globalvolatileuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float8*", "globalvolatilefloat8p",
-        NULL
-    },
-    {
-        "global_volatile_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "globalvolatilechar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "globalvolatileuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "globalvolatileshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "globalvolatileushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "globalvolatileint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "globalvolatileuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "globalvolatilefloat8restrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char8*", "globalconstvolatilechar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar8*", "globalconstvolatileuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short8*", "globalconstvolatileshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort8*", "globalconstvolatileushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int8*", "globalconstvolatileint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint8*", "globalconstvolatileuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float8*", "globalconstvolatilefloat8p",
-        NULL
-    },
-    {
-        "global_const_volatile_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "globalconstvolatilechar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "globalconstvolatileuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "globalconstvolatileshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "globalconstvolatileushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "globalconstvolatileint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "globalconstvolatileuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "globalconstvolatilefloat8restrictp",
-        NULL
-    },
-    {
-        "local_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char8*", "localchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar8*", "localuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short8*", "localshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort8*", "localushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int8*", "localint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint8*", "localuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float8*", "localfloat8p",
-        NULL
-    },
-    {
-        "local_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "localchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "localuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "localshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "localushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "localint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "localuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "localfloat8restrictp",
-        NULL
-    },
-    {
-        "local_const_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char8*", "localconstchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar8*", "localconstuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short8*", "localconstshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort8*", "localconstushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int8*", "localconstint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint8*", "localconstuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float8*", "localconstfloat8p",
-        NULL
-    },
-    {
-        "local_const_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "localconstchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "localconstuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "localconstshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "localconstushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "localconstint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "localconstuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "localconstfloat8restrictp",
-        NULL
-    },
-    {
-        "local_volatile_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char8*", "localvolatilechar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar8*", "localvolatileuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short8*", "localvolatileshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort8*", "localvolatileushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int8*", "localvolatileint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint8*", "localvolatileuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float8*", "localvolatilefloat8p",
-        NULL
-    },
-    {
-        "local_volatile_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "localvolatilechar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "localvolatileuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "localvolatileshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "localvolatileushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "localvolatileint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "localvolatileuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "localvolatilefloat8restrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char8*", "localconstvolatilechar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar8*", "localconstvolatileuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short8*", "localconstvolatileshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort8*", "localconstvolatileushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int8*", "localconstvolatileint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint8*", "localconstvolatileuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float8*", "localconstvolatilefloat8p",
-        NULL
-    },
-    {
-        "local_const_volatile_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "localconstvolatilechar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "localconstvolatileuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "localconstvolatileshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "localconstvolatileushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "localconstvolatileint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "localconstvolatileuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "localconstvolatilefloat8restrictp",
-        NULL
-    },
-    {
-        "vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char8", "char8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar8", "uchar8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short8", "short8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort8", "ushort8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int8", "int8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint8", "uint8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float8", "float8d",
-        NULL
-    },
-    {
-        "const_vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char8", "constchar8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar8", "constuchar8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short8", "constshort8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort8", "constushort8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int8", "constint8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint8", "constuint8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float8", "constfloat8d",
-        NULL
-    },
-    {
-        "private_vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char8", "privatechar8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar8", "privateuchar8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short8", "privateshort8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort8", "privateushort8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int8", "privateint8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint8", "privateuint8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float8", "privatefloat8d",
-        NULL
-    },
-    {
-        "private_const_vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char8", "privateconstchar8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar8", "privateconstuchar8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short8", "privateconstshort8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort8", "privateconstushort8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int8", "privateconstint8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint8", "privateconstuint8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float8", "privateconstfloat8d",
-        NULL
-    },
-    {
-        "constant_vector16_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char16*", "constantchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar16*", "constantuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short16*", "constantshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort16*", "constantushort16p",
-        NULL
-    },
-    {
-        "constant_vector16_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int16*", "constantint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint16*", "constantuint16p",
-        NULL
-    },
-    {
-        "constant_vector16_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float16*", "constantfloat16p",
-        NULL
-    },
-    {
-        "constant_vector16_restrict_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "constantchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "constantuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "constantshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "constantushort16restrictp",
-        NULL
-    },
-    {
-        "constant_vector16_restrict_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "constantint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "constantuint16restrictp",
-        NULL
-    },
-    {
-        "constant_vector16_restrict_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "constantfloat16restrictp",
-        NULL
-    },
-    {
-        "global_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char16*", "globalchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar16*", "globaluchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short16*", "globalshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort16*", "globalushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int16*", "globalint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint16*", "globaluint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float16*", "globalfloat16p",
-        NULL
-    },
-    {
-        "global_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "globalchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "globaluchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "globalshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "globalushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "globalint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "globaluint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "globalfloat16restrictp",
-        NULL
-    },
-    {
-        "global_const_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char16*", "globalconstchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar16*", "globalconstuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short16*", "globalconstshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort16*", "globalconstushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int16*", "globalconstint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint16*", "globalconstuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float16*", "globalconstfloat16p",
-        NULL
-    },
-    {
-        "global_const_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "globalconstchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "globalconstuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "globalconstshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "globalconstushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "globalconstint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "globalconstuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "globalconstfloat16restrictp",
-        NULL
-    },
-    {
-        "global_volatile_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char16*", "globalvolatilechar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar16*", "globalvolatileuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short16*", "globalvolatileshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort16*", "globalvolatileushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int16*", "globalvolatileint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint16*", "globalvolatileuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float16*", "globalvolatilefloat16p",
-        NULL
-    },
-    {
-        "global_volatile_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "globalvolatilechar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "globalvolatileuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "globalvolatileshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "globalvolatileushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "globalvolatileint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "globalvolatileuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "globalvolatilefloat16restrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char16*", "globalconstvolatilechar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar16*", "globalconstvolatileuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short16*", "globalconstvolatileshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort16*", "globalconstvolatileushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int16*", "globalconstvolatileint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint16*", "globalconstvolatileuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float16*", "globalconstvolatilefloat16p",
-        NULL
-    },
-    {
-        "global_const_volatile_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "globalconstvolatilechar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "globalconstvolatileuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "globalconstvolatileshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "globalconstvolatileushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "globalconstvolatileint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "globalconstvolatileuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "globalconstvolatilefloat16restrictp",
-        NULL
-    },
-    {
-        "local_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char16*", "localchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar16*", "localuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short16*", "localshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort16*", "localushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int16*", "localint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint16*", "localuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float16*", "localfloat16p",
-        NULL
-    },
-    {
-        "local_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "localchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "localuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "localshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "localushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "localint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "localuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "localfloat16restrictp",
-        NULL
-    },
-    {
-        "local_const_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char16*", "localconstchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar16*", "localconstuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short16*", "localconstshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort16*", "localconstushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int16*", "localconstint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint16*", "localconstuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float16*", "localconstfloat16p",
-        NULL
-    },
-    {
-        "local_const_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "localconstchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "localconstuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "localconstshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "localconstushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "localconstint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "localconstuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "localconstfloat16restrictp",
-        NULL
-    },
-    {
-        "local_volatile_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char16*", "localvolatilechar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar16*", "localvolatileuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short16*", "localvolatileshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort16*", "localvolatileushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int16*", "localvolatileint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint16*", "localvolatileuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float16*", "localvolatilefloat16p",
-        NULL
-    },
-    {
-        "local_volatile_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "localvolatilechar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "localvolatileuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "localvolatileshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "localvolatileushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "localvolatileint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "localvolatileuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "localvolatilefloat16restrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char16*", "localconstvolatilechar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar16*", "localconstvolatileuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short16*", "localconstvolatileshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort16*", "localconstvolatileushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int16*", "localconstvolatileint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint16*", "localconstvolatileuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float16*", "localconstvolatilefloat16p",
-        NULL
-    },
-    {
-        "local_const_volatile_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "localconstvolatilechar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "localconstvolatileuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "localconstvolatileshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "localconstvolatileushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "localconstvolatileint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "localconstvolatileuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "localconstvolatilefloat16restrictp",
-        NULL
-    },
-    {
-        "vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char16", "char16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar16", "uchar16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short16", "short16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort16", "ushort16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int16", "int16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint16", "uint16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float16", "float16d",
-        NULL
-    },
-    {
-        "const_vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char16", "constchar16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar16", "constuchar16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short16", "constshort16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort16", "constushort16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int16", "constint16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint16", "constuint16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float16", "constfloat16d",
-        NULL
-    },
-    {
-        "private_vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char16", "privatechar16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar16", "privateuchar16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short16", "privateshort16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort16", "privateushort16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int16", "privateint16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint16", "privateuint16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float16", "privatefloat16d",
-        NULL
-    },
-    {
-        "private_const_vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char16", "privateconstchar16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar16", "privateconstuchar16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short16", "privateconstshort16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort16", "privateconstushort16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int16", "privateconstint16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint16", "privateconstuint16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float16", "privateconstfloat16d",
-        NULL
-    },
-    {
-        "constant_derived_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_type*", "constanttypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "struct struct_type*", "constantstructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_struct_type*", "constanttypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "union union_type*", "constantunionunion_typep",
-        NULL
-    },
-    {
-        "constant_derived_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_union_type*", "constanttypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "enum enum_type*", "constantenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_enum_type*", "constanttypedef_enum_typep",
-        NULL
-    },
-    {
-        "constant_derived_restrict_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "constanttypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "constantstructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "constanttypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "constantunionunion_typerestrictp",
-        NULL
-    },
-    {
-        "constant_derived_restrict_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "constanttypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "constantenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "constanttypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "global_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_type*", "globaltypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "struct struct_type*", "globalstructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_struct_type*", "globaltypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "union union_type*", "globalunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_union_type*", "globaltypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "enum enum_type*", "globalenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_enum_type*", "globaltypedef_enum_typep",
-        NULL
-    },
-    {
-        "global_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "globaltypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "globalstructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "globaltypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "globalunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "globaltypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "globalenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "globaltypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "global_const_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_type*", "globalconsttypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "struct struct_type*", "globalconststructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_struct_type*", "globalconsttypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "union union_type*", "globalconstunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_union_type*", "globalconsttypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "enum enum_type*", "globalconstenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_enum_type*", "globalconsttypedef_enum_typep",
-        NULL
-    },
-    {
-        "global_const_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "globalconsttypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "globalconststructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "globalconsttypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "globalconstunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "globalconsttypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "globalconstenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "globalconsttypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "global_volatile_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_type*", "globalvolatiletypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "struct struct_type*", "globalvolatilestructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_struct_type*", "globalvolatiletypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "union union_type*", "globalvolatileunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_union_type*", "globalvolatiletypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "enum enum_type*", "globalvolatileenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_enum_type*", "globalvolatiletypedef_enum_typep",
-        NULL
-    },
-    {
-        "global_volatile_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "globalvolatiletypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "globalvolatilestructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "globalvolatiletypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "globalvolatileunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "globalvolatiletypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "globalvolatileenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "globalvolatiletypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_type*", "globalconstvolatiletypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "struct struct_type*", "globalconstvolatilestructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_struct_type*", "globalconstvolatiletypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "union union_type*", "globalconstvolatileunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_union_type*", "globalconstvolatiletypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "enum enum_type*", "globalconstvolatileenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_enum_type*", "globalconstvolatiletypedef_enum_typep",
-        NULL
-    },
-    {
-        "global_const_volatile_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "globalconstvolatiletypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "globalconstvolatilestructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "globalconstvolatiletypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "globalconstvolatileunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "globalconstvolatiletypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "globalconstvolatileenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "globalconstvolatiletypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "local_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_type*", "localtypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "struct struct_type*", "localstructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_struct_type*", "localtypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "union union_type*", "localunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_union_type*", "localtypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "enum enum_type*", "localenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_enum_type*", "localtypedef_enum_typep",
-        NULL
-    },
-    {
-        "local_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "localtypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "localstructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "localtypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "localunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "localtypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "localenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "localtypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "local_const_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_type*", "localconsttypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "struct struct_type*", "localconststructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_struct_type*", "localconsttypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "union union_type*", "localconstunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_union_type*", "localconsttypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "enum enum_type*", "localconstenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_enum_type*", "localconsttypedef_enum_typep",
-        NULL
-    },
-    {
-        "local_const_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "localconsttypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "localconststructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "localconsttypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "localconstunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "localconsttypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "localconstenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "localconsttypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "local_volatile_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_type*", "localvolatiletypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "struct struct_type*", "localvolatilestructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_struct_type*", "localvolatiletypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "union union_type*", "localvolatileunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_union_type*", "localvolatiletypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "enum enum_type*", "localvolatileenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_enum_type*", "localvolatiletypedef_enum_typep",
-        NULL
-    },
-    {
-        "local_volatile_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "localvolatiletypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "localvolatilestructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "localvolatiletypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "localvolatileunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "localvolatiletypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "localvolatileenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "localvolatiletypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_type*", "localconstvolatiletypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "struct struct_type*", "localconstvolatilestructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_struct_type*", "localconstvolatiletypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "union union_type*", "localconstvolatileunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_union_type*", "localconstvolatiletypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "enum enum_type*", "localconstvolatileenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_enum_type*", "localconstvolatiletypedef_enum_typep",
-        NULL
-    },
-    {
-        "local_const_volatile_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "localconstvolatiletypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "localconstvolatilestructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "localconstvolatiletypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "localconstvolatileunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "localconstvolatiletypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "localconstvolatileenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "localconstvolatiletypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "derived_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_type", "typedef_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "struct struct_type", "structstruct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_struct_type", "typedef_struct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "union union_type", "unionunion_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_union_type", "typedef_union_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "enum enum_type", "enumenum_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_enum_type", "typedef_enum_typed",
-        NULL
-    },
-    {
-        "const_derived_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_type", "consttypedef_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "struct struct_type", "conststructstruct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_struct_type", "consttypedef_struct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "union union_type", "constunionunion_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_union_type", "consttypedef_union_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "enum enum_type", "constenumenum_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_enum_type", "consttypedef_enum_typed",
-        NULL
-    },
-    {
-        "private_derived_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_type", "privatetypedef_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "struct struct_type", "privatestructstruct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_struct_type", "privatetypedef_struct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "union union_type", "privateunionunion_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_union_type", "privatetypedef_union_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "enum enum_type", "privateenumenum_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_enum_type", "privatetypedef_enum_typed",
-        NULL
-    },
-    {
-        "private_const_derived_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_type", "privateconsttypedef_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "struct struct_type", "privateconststructstruct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_struct_type", "privateconsttypedef_struct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "union union_type", "privateconstunionunion_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_union_type", "privateconsttypedef_union_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "enum enum_type", "privateconstenumenum_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_enum_type", "privateconsttypedef_enum_typed",
-        NULL
-    },
-};
-
-// Support for optional image data type
-const char * image_kernel_args[] = {
-    "#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable \n"
-    "kernel void image_d(read_only image2d_t image2d_td0,\n"
-    "                    write_only image2d_t image2d_td1,\n"
-    "                    read_only image3d_t image3d_td2,\n"
-    "                    write_only image3d_t image3d_td3,\n"
-    "                    read_only image2d_array_t image2d_array_td4,\n"
-    "                    write_only image2d_array_t image2d_array_td5,\n"
-    "                    read_only image1d_t image1d_td6,\n"
-    "                    write_only image1d_t image1d_td7,\n"
-    "                    read_only image1d_buffer_t image1d_buffer_td8,\n"
-    "                    write_only image1d_buffer_t image1d_buffer_td9,\n"
-    "                    read_only image1d_array_t image1d_array_td10,\n"
-    "                    write_only image1d_array_t image1d_array_td11,\n"
-    "                    sampler_t sampler_td12)\n"
-    "{}\n",
-    "\n"
-};
-
-const char * image_arg_info[][67] = {
-    {
-        "image_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_READ_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image2d_t", "image2d_td0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_WRITE_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image2d_t", "image2d_td1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_READ_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image3d_t", "image3d_td2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_WRITE_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image3d_t", "image3d_td3",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_READ_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image2d_array_t", "image2d_array_td4",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_WRITE_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image2d_array_t", "image2d_array_td5",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_READ_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image1d_t", "image1d_td6",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_WRITE_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image1d_t", "image1d_td7",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_READ_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image1d_buffer_t", "image1d_buffer_td8",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_WRITE_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image1d_buffer_t", "image1d_buffer_td9",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_READ_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image1d_array_t", "image1d_array_td10",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_WRITE_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image1d_array_t", "image1d_array_td11",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "sampler_t", "sampler_td12",
-        NULL
-    },
-};
-
-// Support for optional double data type
-const char * double_kernel_args[] = {
-    "kernel void double_scalar_p(constant double*constantdoublep,\n"
-    "                            constant double *restrict constantdoublerestrictp,\n"
-    "                            global double*globaldoublep,\n"
-    "                            global double *restrict globaldoublerestrictp,\n"
-    "                            global const double* globalconstdoublep,\n"
-    "                            global const double * restrict globalconstdoublerestrictp,\n"
-    "                            global volatile double*globalvolatiledoublep,\n"
-    "                            global volatile double *restrict globalvolatiledoublerestrictp,\n"
-    "                            global const volatile double* globalconstvolatiledoublep)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_scalar_p2(global const volatile double * restrict globalconstvolatiledoublerestrictp,\n"
-    "                             local double*localdoublep,\n"
-    "                             local double *restrict localdoublerestrictp,\n"
-    "                             local const double* localconstdoublep,\n"
-    "                             local const double * restrict localconstdoublerestrictp,\n"
-    "                             local volatile double*localvolatiledoublep,\n"
-    "                             local volatile double *restrict localvolatiledoublerestrictp,\n"
-    "                             local const volatile double* localconstvolatiledoublep,\n"
-    "                             local const volatile double * restrict localconstvolatiledoublerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_scalar_d(double doubled,\n"
-    "                            const double constdoubled,\n"
-    "                            private double privatedoubled,\n"
-    "                            private const double privateconstdoubled)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector2_p(constant double2*constantdouble2p,\n"
-    "                             constant double2 *restrict constantdouble2restrictp,\n"
-    "                             global double2*globaldouble2p,\n"
-    "                             global double2 *restrict globaldouble2restrictp,\n"
-    "                             global const double2* globalconstdouble2p,\n"
-    "                             global const double2 * restrict globalconstdouble2restrictp,\n"
-    "                             global volatile double2*globalvolatiledouble2p,\n"
-    "                             global volatile double2 *restrict globalvolatiledouble2restrictp,\n"
-    "                             global const volatile double2* globalconstvolatiledouble2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector2_p2(global const volatile double2 * restrict globalconstvolatiledouble2restrictp,\n"
-    "                              local double2*localdouble2p,\n"
-    "                              local double2 *restrict localdouble2restrictp,\n"
-    "                              local const double2* localconstdouble2p,\n"
-    "                              local const double2 * restrict localconstdouble2restrictp,\n"
-    "                              local volatile double2*localvolatiledouble2p,\n"
-    "                              local volatile double2 *restrict localvolatiledouble2restrictp,\n"
-    "                              local const volatile double2* localconstvolatiledouble2p,\n"
-    "                              local const volatile double2 * restrict localconstvolatiledouble2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector2_d(double2 double2d,\n"
-    "                             const double2 constdouble2d,\n"
-    "                             private double2 privatedouble2d,\n"
-    "                             private const double2 privateconstdouble2d)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector3_p(constant double3*constantdouble3p,\n"
-    "                             constant double3 *restrict constantdouble3restrictp,\n"
-    "                             global double3*globaldouble3p,\n"
-    "                             global double3 *restrict globaldouble3restrictp,\n"
-    "                             global const double3* globalconstdouble3p,\n"
-    "                             global const double3 * restrict globalconstdouble3restrictp,\n"
-    "                             global volatile double3*globalvolatiledouble3p,\n"
-    "                             global volatile double3 *restrict globalvolatiledouble3restrictp,\n"
-    "                             global const volatile double3* globalconstvolatiledouble3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector3_p2(global const volatile double3 * restrict globalconstvolatiledouble3restrictp,\n"
-    "                              local double3*localdouble3p,\n"
-    "                              local double3 *restrict localdouble3restrictp,\n"
-    "                              local const double3* localconstdouble3p,\n"
-    "                              local const double3 * restrict localconstdouble3restrictp,\n"
-    "                              local volatile double3*localvolatiledouble3p,\n"
-    "                              local volatile double3 *restrict localvolatiledouble3restrictp,\n"
-    "                              local const volatile double3* localconstvolatiledouble3p,\n"
-    "                              local const volatile double3 * restrict localconstvolatiledouble3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector3_d(double3 double3d,\n"
-    "                             const double3 constdouble3d,\n"
-    "                             private double3 privatedouble3d,\n"
-    "                             private const double3 privateconstdouble3d)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector4_p(constant double4*constantdouble4p,\n"
-    "                             constant double4 *restrict constantdouble4restrictp,\n"
-    "                             global double4*globaldouble4p,\n"
-    "                             global double4 *restrict globaldouble4restrictp,\n"
-    "                             global const double4* globalconstdouble4p,\n"
-    "                             global const double4 * restrict globalconstdouble4restrictp,\n"
-    "                             global volatile double4*globalvolatiledouble4p,\n"
-    "                             global volatile double4 *restrict globalvolatiledouble4restrictp,\n"
-    "                             global const volatile double4* globalconstvolatiledouble4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector4_p2(global const volatile double4 * restrict globalconstvolatiledouble4restrictp,\n"
-    "                              local double4*localdouble4p,\n"
-    "                              local double4 *restrict localdouble4restrictp,\n"
-    "                              local const double4* localconstdouble4p,\n"
-    "                              local const double4 * restrict localconstdouble4restrictp,\n"
-    "                              local volatile double4*localvolatiledouble4p,\n"
-    "                              local volatile double4 *restrict localvolatiledouble4restrictp,\n"
-    "                              local const volatile double4* localconstvolatiledouble4p,\n"
-    "                              local const volatile double4 * restrict localconstvolatiledouble4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector4_d(double4 double4d,\n"
-    "                             const double4 constdouble4d,\n"
-    "                             private double4 privatedouble4d,\n"
-    "                             private const double4 privateconstdouble4d)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector8_p(constant double8*constantdouble8p,\n"
-    "                             constant double8 *restrict constantdouble8restrictp,\n"
-    "                             global double8*globaldouble8p,\n"
-    "                             global double8 *restrict globaldouble8restrictp,\n"
-    "                             global const double8* globalconstdouble8p,\n"
-    "                             global const double8 * restrict globalconstdouble8restrictp,\n"
-    "                             global volatile double8*globalvolatiledouble8p,\n"
-    "                             global volatile double8 *restrict globalvolatiledouble8restrictp,\n"
-    "                             global const volatile double8* globalconstvolatiledouble8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector8_p2(global const volatile double8 * restrict globalconstvolatiledouble8restrictp,\n"
-    "                              local double8*localdouble8p,\n"
-    "                              local double8 *restrict localdouble8restrictp,\n"
-    "                              local const double8* localconstdouble8p,\n"
-    "                              local const double8 * restrict localconstdouble8restrictp,\n"
-    "                              local volatile double8*localvolatiledouble8p,\n"
-    "                              local volatile double8 *restrict localvolatiledouble8restrictp,\n"
-    "                              local const volatile double8* localconstvolatiledouble8p,\n"
-    "                              local const volatile double8 * restrict localconstvolatiledouble8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector8_d(double8 double8d,\n"
-    "                             const double8 constdouble8d,\n"
-    "                             private double8 privatedouble8d,\n"
-    "                             private const double8 privateconstdouble8d)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector16_p(constant double16*constantdouble16p,\n"
-    "                              constant double16 *restrict constantdouble16restrictp,\n"
-    "                              global double16*globaldouble16p,\n"
-    "                              global double16 *restrict globaldouble16restrictp,\n"
-    "                              global const double16* globalconstdouble16p,\n"
-    "                              global const double16 * restrict globalconstdouble16restrictp,\n"
-    "                              global volatile double16*globalvolatiledouble16p,\n"
-    "                              global volatile double16 *restrict globalvolatiledouble16restrictp,\n"
-    "                              global const volatile double16* globalconstvolatiledouble16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector16_p2(global const volatile double16 * restrict globalconstvolatiledouble16restrictp,\n"
-    "                               local double16*localdouble16p,\n"
-    "                               local double16 *restrict localdouble16restrictp,\n"
-    "                               local const double16* localconstdouble16p,\n"
-    "                               local const double16 * restrict localconstdouble16restrictp,\n"
-    "                               local volatile double16*localvolatiledouble16p,\n"
-    "                               local volatile double16 *restrict localvolatiledouble16restrictp,\n"
-    "                               local const volatile double16* localconstvolatiledouble16p,\n"
-    "                               local const volatile double16 * restrict localconstvolatiledouble16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector16_d(double16 double16d,\n"
-    "                              const double16 constdouble16d,\n"
-    "                              private double16 privatedouble16d,\n"
-    "                              private const double16 privateconstdouble16d)\n"
-    "{}\n",
-    "\n"
-};
-
-// Support for optional 3D image data type
-const char * image_3D_kernel_args[] = {
-    "#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable \n"
-    "kernel void image_d(read_only image3d_t image3d_td2,\n"
-    "                    write_only image3d_t image3d_td3)\n"
-    "{}\n",
-    "\n"
-};
-
-const char * image_3D_arg_info[][67] = {
-    {
-        "image_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_READ_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image3d_t", "image3d_td2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_WRITE_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image3d_t", "image3d_td3",
-        NULL
-    },
-};
-
-const char * double_arg_info[][77] = {
-    {
-        "double_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double*", "constantdoublep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "constantdoublerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double*", "globaldoublep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "globaldoublerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double*", "globalconstdoublep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "globalconstdoublerestrictp",
-    (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double*", "globalvolatiledoublep",
-    (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "globalvolatiledoublerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double*", "globalconstvolatiledoublep",
-        NULL
-    },
-    {
-        "double_scalar_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "globalconstvolatiledoublerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double*", "localdoublep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "localdoublerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double*", "localconstdoublep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "localconstdoublerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double*", "localvolatiledoublep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "localvolatiledoublerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double*", "localconstvolatiledoublep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "localconstvolatiledoublerestrictp",
-        NULL
-    },
-    {
-        "double_scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double", "doubled",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double", "constdoubled",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double", "privatedoubled",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double", "privateconstdoubled",
-        NULL
-    },
-    {
-        "double_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double2*", "constantdouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "constantdouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double2*", "globaldouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "globaldouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double2*", "globalconstdouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "globalconstdouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double2*", "globalvolatiledouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "globalvolatiledouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double2*", "globalconstvolatiledouble2p",
-        NULL
-    },
-    {
-        "double_vector2_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "globalconstvolatiledouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double2*", "localdouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "localdouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double2*", "localconstdouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "localconstdouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double2*", "localvolatiledouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "localvolatiledouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double2*", "localconstvolatiledouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "localconstvolatiledouble2restrictp",
-        NULL
-    },
-    {
-        "double_vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double2", "double2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double2", "constdouble2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double2", "privatedouble2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double2", "privateconstdouble2d",
-        NULL
-    },
-    {
-        "double_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double3*", "constantdouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "constantdouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double3*", "globaldouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "globaldouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double3*", "globalconstdouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "globalconstdouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double3*", "globalvolatiledouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "globalvolatiledouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double3*", "globalconstvolatiledouble3p",
-        NULL
-    },
-    {
-        "double_vector3_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "globalconstvolatiledouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double3*", "localdouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "localdouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double3*", "localconstdouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "localconstdouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double3*", "localvolatiledouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "localvolatiledouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double3*", "localconstvolatiledouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "localconstvolatiledouble3restrictp",
-        NULL
-    },
-    {
-        "double_vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double3", "double3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double3", "constdouble3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double3", "privatedouble3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double3", "privateconstdouble3d",
-        NULL
-    },
-    {
-        "double_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double4*", "constantdouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "constantdouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double4*", "globaldouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "globaldouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double4*", "globalconstdouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "globalconstdouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double4*", "globalvolatiledouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "globalvolatiledouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double4*", "globalconstvolatiledouble4p",
-        NULL
-    },
-    {
-        "double_vector4_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "globalconstvolatiledouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double4*", "localdouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "localdouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double4*", "localconstdouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "localconstdouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double4*", "localvolatiledouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "localvolatiledouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double4*", "localconstvolatiledouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "localconstvolatiledouble4restrictp",
-        NULL
-    },
-    {
-        "double_vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double4", "double4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double4", "constdouble4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double4", "privatedouble4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double4", "privateconstdouble4d",
-        NULL
-    },
-    {
-        "double_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double8*", "constantdouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "constantdouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double8*", "globaldouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "globaldouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double8*", "globalconstdouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "globalconstdouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double8*", "globalvolatiledouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "globalvolatiledouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double8*", "globalconstvolatiledouble8p",
-        NULL
-    },
-    {
-        "double_vector8_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "globalconstvolatiledouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double8*", "localdouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "localdouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double8*", "localconstdouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "localconstdouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double8*", "localvolatiledouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "localvolatiledouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double8*", "localconstvolatiledouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "localconstvolatiledouble8restrictp",
-        NULL
-    },
-    {
-        "double_vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double8", "double8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double8", "constdouble8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double8", "privatedouble8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double8", "privateconstdouble8d",
-        NULL
-    },
-    {
-        "double_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double16*", "constantdouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "constantdouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double16*", "globaldouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "globaldouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double16*", "globalconstdouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "globalconstdouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double16*", "globalvolatiledouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "globalvolatiledouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double16*", "globalconstvolatiledouble16p",
-        NULL
-    },
-    {
-        "double_vector16_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "globalconstvolatiledouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double16*", "localdouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "localdouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double16*", "localconstdouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "localconstdouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double16*", "localvolatiledouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "localvolatiledouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double16*", "localconstvolatiledouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "localconstvolatiledouble16restrictp",
-        NULL
-    },
-    {
-        "double_vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double16", "double16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double16", "constdouble16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double16", "privatedouble16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double16", "privateconstdouble16d",
-        NULL
-    },
-};
-
-
-// Support for optional half data type
-const char * half_kernel_args[] = {
-    "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
-    "\n"
-    "kernel void half_scalar_p(constant half*constanthalfp,\n"
-    "                          constant half *restrict constanthalfrestrictp,\n"
-    "                          global half*globalhalfp,\n"
-    "                          global half *restrict globalhalfrestrictp,\n"
-    "                          global const half* globalconsthalfp,\n"
-    "                          global const half * restrict globalconsthalfrestrictp,\n"
-    "                          global volatile half*globalvolatilehalfp,\n"
-    "                          global volatile half *restrict globalvolatilehalfrestrictp,\n"
-    "                          global const volatile half* globalconstvolatilehalfp)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_scalar_p2(global const volatile half * restrict globalconstvolatilehalfrestrictp,\n"
-    "                           local half*localhalfp,\n"
-    "                           local half *restrict localhalfrestrictp,\n"
-    "                           local const half* localconsthalfp,\n"
-    "                           local const half * restrict localconsthalfrestrictp,\n"
-    "                           local volatile half*localvolatilehalfp,\n"
-    "                           local volatile half *restrict localvolatilehalfrestrictp,\n"
-    "                           local const volatile half* localconstvolatilehalfp,\n"
-    "                           local const volatile half * restrict localconstvolatilehalfrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector2_p(constant half2*constanthalf2p,\n"
-    "                           constant half2 *restrict constanthalf2restrictp,\n"
-    "                           global half2*globalhalf2p,\n"
-    "                           global half2 *restrict globalhalf2restrictp,\n"
-    "                           global const half2* globalconsthalf2p,\n"
-    "                           global const half2 * restrict globalconsthalf2restrictp,\n"
-    "                           global volatile half2*globalvolatilehalf2p,\n"
-    "                           global volatile half2 *restrict globalvolatilehalf2restrictp,\n"
-    "                           global const volatile half2* globalconstvolatilehalf2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector2_p2(global const volatile half2 * restrict globalconstvolatilehalf2restrictp,\n"
-    "                            local half2*localhalf2p,\n"
-    "                            local half2 *restrict localhalf2restrictp,\n"
-    "                            local const half2* localconsthalf2p,\n"
-    "                            local const half2 * restrict localconsthalf2restrictp,\n"
-    "                            local volatile half2*localvolatilehalf2p,\n"
-    "                            local volatile half2 *restrict localvolatilehalf2restrictp,\n"
-    "                            local const volatile half2* localconstvolatilehalf2p,\n"
-    "                            local const volatile half2 * restrict localconstvolatilehalf2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector3_p(constant half3*constanthalf3p,\n"
-    "                           constant half3 *restrict constanthalf3restrictp,\n"
-    "                           global half3*globalhalf3p,\n"
-    "                           global half3 *restrict globalhalf3restrictp,\n"
-    "                           global const half3* globalconsthalf3p,\n"
-    "                           global const half3 * restrict globalconsthalf3restrictp,\n"
-    "                           global volatile half3*globalvolatilehalf3p,\n"
-    "                           global volatile half3 *restrict globalvolatilehalf3restrictp,\n"
-    "                           global const volatile half3* globalconstvolatilehalf3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector3_p2(global const volatile half3 * restrict globalconstvolatilehalf3restrictp,\n"
-    "                            local half3*localhalf3p,\n"
-    "                            local half3 *restrict localhalf3restrictp,\n"
-    "                            local const half3* localconsthalf3p,\n"
-    "                            local const half3 * restrict localconsthalf3restrictp,\n"
-    "                            local volatile half3*localvolatilehalf3p,\n"
-    "                            local volatile half3 *restrict localvolatilehalf3restrictp,\n"
-    "                            local const volatile half3* localconstvolatilehalf3p,\n"
-    "                            local const volatile half3 * restrict localconstvolatilehalf3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector4_p(constant half4*constanthalf4p,\n"
-    "                           constant half4 *restrict constanthalf4restrictp,\n"
-    "                           global half4*globalhalf4p,\n"
-    "                           global half4 *restrict globalhalf4restrictp,\n"
-    "                           global const half4* globalconsthalf4p,\n"
-    "                           global const half4 * restrict globalconsthalf4restrictp,\n"
-    "                           global volatile half4*globalvolatilehalf4p,\n"
-    "                           global volatile half4 *restrict globalvolatilehalf4restrictp,\n"
-    "                           global const volatile half4* globalconstvolatilehalf4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector4_p2(global const volatile half4 * restrict globalconstvolatilehalf4restrictp,\n"
-    "                            local half4*localhalf4p,\n"
-    "                            local half4 *restrict localhalf4restrictp,\n"
-    "                            local const half4* localconsthalf4p,\n"
-    "                            local const half4 * restrict localconsthalf4restrictp,\n"
-    "                            local volatile half4*localvolatilehalf4p,\n"
-    "                            local volatile half4 *restrict localvolatilehalf4restrictp,\n"
-    "                            local const volatile half4* localconstvolatilehalf4p,\n"
-    "                            local const volatile half4 * restrict localconstvolatilehalf4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector8_p(constant half8*constanthalf8p,\n"
-    "                           constant half8 *restrict constanthalf8restrictp,\n"
-    "                           global half8*globalhalf8p,\n"
-    "                           global half8 *restrict globalhalf8restrictp,\n"
-    "                           global const half8* globalconsthalf8p,\n"
-    "                           global const half8 * restrict globalconsthalf8restrictp,\n"
-    "                           global volatile half8*globalvolatilehalf8p,\n"
-    "                           global volatile half8 *restrict globalvolatilehalf8restrictp,\n"
-    "                           global const volatile half8* globalconstvolatilehalf8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector8_p2(global const volatile half8 * restrict globalconstvolatilehalf8restrictp,\n"
-    "                            local half8*localhalf8p,\n"
-    "                            local half8 *restrict localhalf8restrictp,\n"
-    "                            local const half8* localconsthalf8p,\n"
-    "                            local const half8 * restrict localconsthalf8restrictp,\n"
-    "                            local volatile half8*localvolatilehalf8p,\n"
-    "                            local volatile half8 *restrict localvolatilehalf8restrictp,\n"
-    "                            local const volatile half8* localconstvolatilehalf8p,\n"
-    "                            local const volatile half8 * restrict localconstvolatilehalf8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector16_p(constant half16*constanthalf16p,\n"
-    "                            constant half16 *restrict constanthalf16restrictp,\n"
-    "                            global half16*globalhalf16p,\n"
-    "                            global half16 *restrict globalhalf16restrictp,\n"
-    "                            global const half16* globalconsthalf16p,\n"
-    "                            global const half16 * restrict globalconsthalf16restrictp,\n"
-    "                            global volatile half16*globalvolatilehalf16p,\n"
-    "                            global volatile half16 *restrict globalvolatilehalf16restrictp,\n"
-    "                            global const volatile half16* globalconstvolatilehalf16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector16_p2(global const volatile half16 * restrict globalconstvolatilehalf16restrictp,\n"
-    "                             local half16*localhalf16p,\n"
-    "                             local half16 *restrict localhalf16restrictp,\n"
-    "                             local const half16* localconsthalf16p,\n"
-    "                             local const half16 * restrict localconsthalf16restrictp,\n"
-    "                             local volatile half16*localvolatilehalf16p,\n"
-    "                             local volatile half16 *restrict localvolatilehalf16restrictp,\n"
-    "                             local const volatile half16* localconstvolatilehalf16p,\n"
-    "                             local const volatile half16 * restrict localconstvolatilehalf16restrictp)\n"
-    "{}\n",
-    "\n"
-};
-
-const char * half_arg_info[][77] = {
-    {
-        "half_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half*", "constanthalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "constanthalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half*", "globalhalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "globalhalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half*", "globalconsthalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "globalconsthalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half*", "globalvolatilehalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "globalvolatilehalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half*", "globalconstvolatilehalfp",
-        NULL
-    },
-    {
-        "half_scalar_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "globalconstvolatilehalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half*", "localhalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "localhalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half*", "localconsthalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "localconsthalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half*", "localvolatilehalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "localvolatilehalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half*", "localconstvolatilehalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "localconstvolatilehalfrestrictp",
-        NULL
-    },
-    {
-        "half_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half2*", "constanthalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "constanthalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half2*", "globalhalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "globalhalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half2*", "globalconsthalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "globalconsthalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half2*", "globalvolatilehalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "globalvolatilehalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half2*", "globalconstvolatilehalf2p",
-        NULL
-    },
-    {
-        "half_vector2_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "globalconstvolatilehalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half2*", "localhalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "localhalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half2*", "localconsthalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "localconsthalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half2*", "localvolatilehalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "localvolatilehalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half2*", "localconstvolatilehalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "localconstvolatilehalf2restrictp",
-        NULL
-    },
-    {
-        "half_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half3*", "constanthalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "constanthalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half3*", "globalhalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "globalhalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half3*", "globalconsthalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "globalconsthalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half3*", "globalvolatilehalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "globalvolatilehalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half3*", "globalconstvolatilehalf3p",
-        NULL
-    },
-    {
-        "half_vector3_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "globalconstvolatilehalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half3*", "localhalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "localhalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half3*", "localconsthalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "localconsthalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half3*", "localvolatilehalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "localvolatilehalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half3*", "localconstvolatilehalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "localconstvolatilehalf3restrictp",
-        NULL
-    },
-    {
-        "half_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half4*", "constanthalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "constanthalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half4*", "globalhalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "globalhalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half4*", "globalconsthalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "globalconsthalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half4*", "globalvolatilehalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "globalvolatilehalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half4*", "globalconstvolatilehalf4p",
-        NULL
-    },
-    {
-        "half_vector4_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "globalconstvolatilehalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half4*", "localhalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "localhalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half4*", "localconsthalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "localconsthalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half4*", "localvolatilehalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "localvolatilehalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half4*", "localconstvolatilehalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "localconstvolatilehalf4restrictp",
-        NULL
-    },
-    {
-        "half_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half8*", "constanthalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "constanthalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half8*", "globalhalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "globalhalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half8*", "globalconsthalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "globalconsthalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half8*", "globalvolatilehalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "globalvolatilehalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half8*", "globalconstvolatilehalf8p",
-        NULL
-    },
-    {
-        "half_vector8_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "globalconstvolatilehalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half8*", "localhalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "localhalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half8*", "localconsthalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "localconsthalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half8*", "localvolatilehalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "localvolatilehalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half8*", "localconstvolatilehalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "localconstvolatilehalf8restrictp",
-        NULL
-    },
-    {
-        "half_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half16*", "constanthalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "constanthalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half16*", "globalhalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "globalhalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half16*", "globalconsthalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "globalconsthalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half16*", "globalvolatilehalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "globalvolatilehalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half16*", "globalconstvolatilehalf16p",
-        NULL
-    },
-    {
-        "half_vector16_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "globalconstvolatilehalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half16*", "localhalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "localhalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half16*", "localconsthalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "localconsthalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half16*", "localvolatilehalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "localvolatilehalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half16*", "localconstvolatilehalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "localconstvolatilehalf16restrictp",
-        NULL
-    },
-};
-
-const char * long_kernel_args[] = {
-        "kernel void constant_scalar_p2(constant long* constantlongp,\n"
-        "                              constant ulong * constantulongp)\n"
-      "{}\n",
-        "kernel void constant_scalar_p3(constant unsigned long*constantunsignedlongp)\n"
-        "{}\n",
-        "\n"
-        "kernel void constant_scalar_restrict_p2(constant long*restrict constantlongrestrictp,\n"
-        "                                       constant ulong *restrict constantulongrestrictp)\n"
-        "{}\n",
-        "kernel void constant_scalar_restrict_p3(constant unsigned long* restrict constantunsignedlongrestrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_scalar_p(global long* globallongp,\n"
-        "                            global ulong * globalulongp,\n"
-        "                            global unsigned long*globalunsignedlongp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_scalar_restrict_p(global long*restrict globallongrestrictp,\n"
-        "                                     global ulong *restrict globalulongrestrictp,\n"
-        "                                     global unsigned long* restrict globalunsignedlongrestrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_scalar_p(global const long* globalconstlongp,\n"
-        "                                  global const ulong * globalconstulongp,\n"
-        "                                  global const unsigned long*globalconstunsignedlongp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_scalar_restrict_p(global const long*restrict globalconstlongrestrictp,\n"
-        "                                           global const ulong *restrict globalconstulongrestrictp,\n"
-        "                                           global const unsigned long* restrict globalconstunsignedlongrestrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_volatile_scalar_p(global volatile long* globalvolatilelongp,\n"
-        "                                     global volatile ulong * globalvolatileulongp,\n"
-        "                                     global volatile unsigned long*globalvolatileunsignedlongp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_volatile_scalar_restrict_p(global volatile long*restrict globalvolatilelongrestrictp,\n"
-        "                                              global volatile ulong *restrict globalvolatileulongrestrictp,\n"
-        "                                              global volatile unsigned long* restrict globalvolatileunsignedlongrestrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_volatile_scalar_p(global const volatile long* globalconstvolatilelongp,\n"
-        "                                           global const volatile ulong * globalconstvolatileulongp,\n"
-        "                                           global const volatile unsigned long*globalconstvolatileunsignedlongp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_volatile_scalar_restrict_p(global const volatile long*restrict globalconstvolatilelongrestrictp,\n"
-        "                                                    global const volatile ulong *restrict globalconstvolatileulongrestrictp,\n"
-        "                                                    global const volatile unsigned long* restrict globalconstvolatileunsignedlongrestrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_scalar_p(local long* locallongp,\n"
-        "                           local ulong * localulongp,\n"
-        "                           local unsigned long*localunsignedlongp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_scalar_restrict_p(local long*restrict locallongrestrictp,\n"
-        "                                    local ulong *restrict localulongrestrictp,\n"
-        "                                    local unsigned long* restrict localunsignedlongrestrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_scalar_p(local const long* localconstlongp,\n"
-        "                                 local const ulong * localconstulongp,\n"
-        "                                 local const unsigned long*localconstunsignedlongp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_scalar_restrict_p(local const long*restrict localconstlongrestrictp,\n"
-        "                                          local const ulong *restrict localconstulongrestrictp,\n"
-        "                                          local const unsigned long* restrict localconstunsignedlongrestrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_volatile_scalar_p(local volatile long* localvolatilelongp,\n"
-        "                                    local volatile ulong * localvolatileulongp,\n"
-        "                                    local volatile unsigned long*localvolatileunsignedlongp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_volatile_scalar_restrict_p(local volatile long*restrict localvolatilelongrestrictp,\n"
-        "                                             local volatile ulong *restrict localvolatileulongrestrictp,\n"
-        "                                             local volatile unsigned long* restrict localvolatileunsignedlongrestrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_volatile_scalar_p(local const volatile long* localconstvolatilelongp,\n"
-        "                                          local const volatile ulong * localconstvolatileulongp,\n"
-        "                                          local const volatile unsigned long*localconstvolatileunsignedlongp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_volatile_scalar_restrict_p(local const volatile long*restrict localconstvolatilelongrestrictp,\n"
-        "                                                   local const volatile ulong *restrict localconstvolatileulongrestrictp,\n"
-        "                                                   local const volatile unsigned long* restrict localconstvolatileunsignedlongrestrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void scalar_d(long longd,\n"
-        "                     ulong ulongd,\n"
-        "                     unsigned long unsignedlongd)\n"
-        "{}\n",
-        "\n"
-        "kernel void const_scalar_d(const long constlongd,\n"
-        "                           const ulong constulongd,\n"
-        "                           const unsigned long constunsignedlongd)\n"
-        "{}\n",
-        "\n"
-        "kernel void private_scalar_d(private long privatelongd,\n"
-        "                             private ulong privateulongd,\n"
-        "                             private unsigned long privateunsignedlongd)\n"
-        "{}\n",
-        "\n"
-        "kernel void private_const_scalar_d(private const long privateconstlongd,\n"
-        "                                   private const ulong privateconstulongd,\n"
-        "                                   private const unsigned long privateconstunsignedlongd)\n"
-        "{}\n",
-        "\n"
-        "kernel void constant_vector2_p1(constant long2* constantlong2p,\n"
-        "                               constant ulong2 * constantulong2p)\n"
-      "{}\n",
-        "\n"
-        "kernel void constant_vector2_restrict_p1(constant long2 * restrict constantlong2restrictp,\n"
-        "                                        constant ulong2*restrict constantulong2restrictp)\n"
-      "{}\n",
-        "\n"
-        "kernel void global_vector2_p(global long2* globallong2p,\n"
-        "                             global ulong2 * globalulong2p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_vector2_restrict_p(global long2 * restrict globallong2restrictp,\n"
-        "                                      global ulong2*restrict globalulong2restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_vector2_p(global const long2*globalconstlong2p,\n"
-        "                                   global const ulong2 *globalconstulong2p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_vector2_restrict_p(global const long2 *restrict globalconstlong2restrictp,\n"
-        "                                            global const ulong2* restrict globalconstulong2restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_volatile_vector2_p(global volatile long2* globalvolatilelong2p,\n"
-        "                                      global volatile ulong2 * globalvolatileulong2p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_volatile_vector2_restrict_p(global volatile long2 * restrict globalvolatilelong2restrictp,\n"
-        "                                               global volatile ulong2*restrict globalvolatileulong2restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_volatile_vector2_p(global const volatile long2*globalconstvolatilelong2p,\n"
-        "                                            global const volatile ulong2 *globalconstvolatileulong2p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_volatile_vector2_restrict_p(global const volatile long2 *restrict globalconstvolatilelong2restrictp,\n"
-        "                                                     global const volatile ulong2* restrict globalconstvolatileulong2restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_vector2_p(local long2* locallong2p,\n"
-        "                            local ulong2 * localulong2p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_vector2_restrict_p(local long2 * restrict locallong2restrictp,\n"
-        "                                     local ulong2*restrict localulong2restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_vector2_p(local const long2*localconstlong2p,\n"
-        "                                  local const ulong2 *localconstulong2p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_vector2_restrict_p(local const long2 *restrict localconstlong2restrictp,\n"
-        "                                           local const ulong2* restrict localconstulong2restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_volatile_vector2_p(local volatile long2* localvolatilelong2p,\n"
-        "                                     local volatile ulong2 * localvolatileulong2p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_volatile_vector2_restrict_p(local volatile long2 * restrict localvolatilelong2restrictp,\n"
-        "                                              local volatile ulong2*restrict localvolatileulong2restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_volatile_vector2_p(local const volatile long2*localconstvolatilelong2p,\n"
-        "                                           local const volatile ulong2 *localconstvolatileulong2p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_volatile_vector2_restrict_p(local const volatile long2 *restrict localconstvolatilelong2restrictp,\n"
-        "                                                    local const volatile ulong2* restrict localconstvolatileulong2restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void vector2_d(long2 long2d,\n"
-        "                      ulong2 ulong2d)\n"
-        "{}\n",
-        "\n"
-        "kernel void const_vector2_d(const long2 constlong2d,\n"
-        "                            const ulong2 constulong2d)\n"
-        "{}\n",
-        "\n"
-        "kernel void private_vector2_d(private long2 privatelong2d,\n"
-        "                              private ulong2 privateulong2d)\n"
-        "{}\n",
-        "\n"
-        "kernel void private_const_vector2_d(private const long2 privateconstlong2d,\n"
-        "                                    private const ulong2 privateconstulong2d)\n"
-        "{}\n",
-        "\n"
-        "kernel void constant_vector3_p1(constant long3* constantlong3p,\n"
-        "                               constant ulong3 * constantulong3p)\n"
-        "{}\n",
-        "\n"
-        "kernel void constant_vector3_restrict_p1(constant long3 * restrict constantlong3restrictp,\n"
-        "                                        constant ulong3*restrict constantulong3restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_vector3_p(global long3* globallong3p,\n"
-        "                             global ulong3 * globalulong3p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_vector3_restrict_p(global long3 * restrict globallong3restrictp,\n"
-        "                                      global ulong3*restrict globalulong3restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_vector3_p(global const long3*globalconstlong3p,\n"
-        "                                   global const ulong3 *globalconstulong3p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_vector3_restrict_p(global const long3 *restrict globalconstlong3restrictp,\n"
-        "                                            global const ulong3* restrict globalconstulong3restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_volatile_vector3_p(global volatile long3* globalvolatilelong3p,\n"
-        "                                      global volatile ulong3 * globalvolatileulong3p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_volatile_vector3_restrict_p(global volatile long3 * restrict globalvolatilelong3restrictp,\n"
-        "                                               global volatile ulong3*restrict globalvolatileulong3restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_volatile_vector3_p(global const volatile long3*globalconstvolatilelong3p,\n"
-        "                                            global const volatile ulong3 *globalconstvolatileulong3p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_volatile_vector3_restrict_p(global const volatile long3 *restrict globalconstvolatilelong3restrictp,\n"
-        "                                                     global const volatile ulong3* restrict globalconstvolatileulong3restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_vector3_p(local long3* locallong3p,\n"
-        "                            local ulong3 * localulong3p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_vector3_restrict_p(local long3 * restrict locallong3restrictp,\n"
-        "                                     local ulong3*restrict localulong3restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_vector3_p(local const long3*localconstlong3p,\n"
-        "                                  local const ulong3 *localconstulong3p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_vector3_restrict_p(local const long3 *restrict localconstlong3restrictp,\n"
-        "                                           local const ulong3* restrict localconstulong3restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_volatile_vector3_p(local volatile long3* localvolatilelong3p,\n"
-        "                                     local volatile ulong3 * localvolatileulong3p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_volatile_vector3_restrict_p(local volatile long3 * restrict localvolatilelong3restrictp,\n"
-        "                                              local volatile ulong3*restrict localvolatileulong3restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_volatile_vector3_p(local const volatile long3*localconstvolatilelong3p,\n"
-        "                                           local const volatile ulong3 *localconstvolatileulong3p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_volatile_vector3_restrict_p(local const volatile long3 *restrict localconstvolatilelong3restrictp,\n"
-        "                                                    local const volatile ulong3* restrict localconstvolatileulong3restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void vector3_d(long3 long3d,\n"
-        "                      ulong3 ulong3d)\n"
-        "{}\n",
-        "\n"
-        "kernel void const_vector3_d(const long3 constlong3d,\n"
-        "                            const ulong3 constulong3d)\n"
-        "{}\n",
-        "\n"
-        "kernel void private_vector3_d(private long3 privatelong3d,\n"
-        "                              private ulong3 privateulong3d)\n"
-        "{}\n",
-        "\n"
-        "kernel void private_const_vector3_d(private const long3 privateconstlong3d,\n"
-        "                                    private const ulong3 privateconstulong3d)\n"
-        "{}\n",
-        "\n"
-        "kernel void constant_vector4_p1(constant long4* constantlong4p,\n"
-        "                               constant ulong4 * constantulong4p)\n"
-        "{}\n",
-        "\n"
-        "kernel void constant_vector4_restrict_p1(constant long4 * restrict constantlong4restrictp,\n"
-        "                                        constant ulong4*restrict constantulong4restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_vector4_p(global long4* globallong4p,\n"
-        "                             global ulong4 * globalulong4p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_vector4_restrict_p(global long4 * restrict globallong4restrictp,\n"
-        "                                      global ulong4*restrict globalulong4restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_vector4_p(global const long4*globalconstlong4p,\n"
-        "                                   global const ulong4 *globalconstulong4p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_vector4_restrict_p(global const long4 *restrict globalconstlong4restrictp,\n"
-        "                                            global const ulong4* restrict globalconstulong4restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_volatile_vector4_p(global volatile long4* globalvolatilelong4p,\n"
-        "                                      global volatile ulong4 * globalvolatileulong4p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_volatile_vector4_restrict_p(global volatile long4 * restrict globalvolatilelong4restrictp,\n"
-        "                                               global volatile ulong4*restrict globalvolatileulong4restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_volatile_vector4_p(global const volatile long4*globalconstvolatilelong4p,\n"
-        "                                            global const volatile ulong4 *globalconstvolatileulong4p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_volatile_vector4_restrict_p(global const volatile long4 *restrict globalconstvolatilelong4restrictp,\n"
-        "                                                     global const volatile ulong4* restrict globalconstvolatileulong4restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_vector4_p(local long4* locallong4p,\n"
-        "                            local ulong4 * localulong4p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_vector4_restrict_p(local long4 * restrict locallong4restrictp,\n"
-        "                                     local ulong4*restrict localulong4restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_vector4_p(local const long4*localconstlong4p,\n"
-        "                                  local const ulong4 *localconstulong4p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_vector4_restrict_p(local const long4 *restrict localconstlong4restrictp,\n"
-        "                                           local const ulong4* restrict localconstulong4restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_volatile_vector4_p(local volatile long4* localvolatilelong4p,\n"
-        "                                     local volatile ulong4 * localvolatileulong4p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_volatile_vector4_restrict_p(local volatile long4 * restrict localvolatilelong4restrictp,\n"
-        "                                              local volatile ulong4*restrict localvolatileulong4restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_volatile_vector4_p(local const volatile long4*localconstvolatilelong4p,\n"
-        "                                           local const volatile ulong4 *localconstvolatileulong4p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_volatile_vector4_restrict_p(local const volatile long4 *restrict localconstvolatilelong4restrictp,\n"
-        "                                                    local const volatile ulong4* restrict localconstvolatileulong4restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void vector4_d(long4 long4d,\n"
-        "                      ulong4 ulong4d)\n"
-        "{}\n",
-        "\n"
-        "kernel void const_vector4_d(const long4 constlong4d,\n"
-        "                            const ulong4 constulong4d)\n"
-        "{}\n",
-        "\n"
-        "kernel void private_vector4_d(private long4 privatelong4d,\n"
-        "                              private ulong4 privateulong4d)\n"
-        "{}\n",
-        "\n"
-        "kernel void private_const_vector4_d(private const long4 privateconstlong4d,\n"
-        "                                    private const ulong4 privateconstulong4d)\n"
-        "{}\n",
-        "\n"
-        "kernel void constant_vector8_p1(constant long8* constantlong8p,\n"
-        "                               constant ulong8 * constantulong8p)\n"
-        "{}\n",
-        "\n"
-        "kernel void constant_vector8_restrict_p1(constant long8 * restrict constantlong8restrictp,\n"
-        "                                        constant ulong8*restrict constantulong8restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_vector8_p(global long8* globallong8p,\n"
-        "                             global ulong8 * globalulong8p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_vector8_restrict_p(global long8 * restrict globallong8restrictp,\n"
-        "                                      global ulong8*restrict globalulong8restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_vector8_p(global const long8*globalconstlong8p,\n"
-        "                                   global const ulong8 *globalconstulong8p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_vector8_restrict_p(global const long8 *restrict globalconstlong8restrictp,\n"
-        "                                            global const ulong8* restrict globalconstulong8restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_volatile_vector8_p(global volatile long8* globalvolatilelong8p,\n"
-        "                                      global volatile ulong8 * globalvolatileulong8p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_volatile_vector8_restrict_p(global volatile long8 * restrict globalvolatilelong8restrictp,\n"
-        "                                               global volatile ulong8*restrict globalvolatileulong8restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_volatile_vector8_p(global const volatile long8*globalconstvolatilelong8p,\n"
-        "                                            global const volatile ulong8 *globalconstvolatileulong8p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_volatile_vector8_restrict_p(global const volatile long8 *restrict globalconstvolatilelong8restrictp,\n"
-        "                                                     global const volatile ulong8* restrict globalconstvolatileulong8restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_vector8_p(local long8* locallong8p,\n"
-        "                            local ulong8 * localulong8p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_vector8_restrict_p(local long8 * restrict locallong8restrictp,\n"
-        "                                     local ulong8*restrict localulong8restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_vector8_p(local const long8*localconstlong8p,\n"
-        "                                  local const ulong8 *localconstulong8p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_vector8_restrict_p(local const long8 *restrict localconstlong8restrictp,\n"
-        "                                           local const ulong8* restrict localconstulong8restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_volatile_vector8_p(local volatile long8* localvolatilelong8p,\n"
-        "                                     local volatile ulong8 * localvolatileulong8p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_volatile_vector8_restrict_p(local volatile long8 * restrict localvolatilelong8restrictp,\n"
-        "                                              local volatile ulong8*restrict localvolatileulong8restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_volatile_vector8_p(local const volatile long8*localconstvolatilelong8p,\n"
-        "                                           local const volatile ulong8 *localconstvolatileulong8p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_volatile_vector8_restrict_p(local const volatile long8 *restrict localconstvolatilelong8restrictp,\n"
-        "                                                    local const volatile ulong8* restrict localconstvolatileulong8restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void vector8_d(long8 long8d,\n"
-        "                      ulong8 ulong8d)\n"
-        "{}\n",
-        "\n"
-        "kernel void const_vector8_d(const long8 constlong8d,\n"
-        "                            const ulong8 constulong8d)\n"
-        "{}\n",
-        "\n"
-        "kernel void private_vector8_d(private long8 privatelong8d,\n"
-        "                              private ulong8 privateulong8d)\n"
-        "{}\n",
-        "\n"
-        "kernel void private_const_vector8_d(private const long8 privateconstlong8d,\n"
-        "                                    private const ulong8 privateconstulong8d)\n"
-        "{}\n",
-        "\n"
-        "kernel void constant_vector16_p1(constant long16* constantlong16p,\n"
-        "                                constant ulong16 * constantulong16p)\n"
-        "{}\n",
-        "\n"
-        "kernel void constant_vector16_restrict_p1(constant long16 * restrict constantlong16restrictp,\n"
-        "                                         constant ulong16*restrict constantulong16restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_vector16_p(global long16* globallong16p,\n"
-        "                              global ulong16 * globalulong16p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_vector16_restrict_p(global long16 * restrict globallong16restrictp,\n"
-        "                                       global ulong16*restrict globalulong16restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_vector16_p(global const long16*globalconstlong16p,\n"
-        "                                    global const ulong16 *globalconstulong16p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_vector16_restrict_p(global const long16 *restrict globalconstlong16restrictp,\n"
-        "                                             global const ulong16* restrict globalconstulong16restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_volatile_vector16_p(global volatile long16* globalvolatilelong16p,\n"
-        "                                       global volatile ulong16 * globalvolatileulong16p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_volatile_vector16_restrict_p(global volatile long16 * restrict globalvolatilelong16restrictp,\n"
-        "                                                global volatile ulong16*restrict globalvolatileulong16restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_volatile_vector16_p(global const volatile long16*globalconstvolatilelong16p,\n"
-        "                                             global const volatile ulong16 *globalconstvolatileulong16p)\n"
-        "{}\n",
-        "\n"
-        "kernel void global_const_volatile_vector16_restrict_p(global const volatile long16 *restrict globalconstvolatilelong16restrictp,\n"
-        "                                                      global const volatile ulong16* restrict globalconstvolatileulong16restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_vector16_p(local long16* locallong16p,\n"
-        "                             local ulong16 * localulong16p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_vector16_restrict_p(local long16 * restrict locallong16restrictp,\n"
-        "                                      local ulong16*restrict localulong16restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_vector16_p(local const long16*localconstlong16p,\n"
-        "                                   local const ulong16 *localconstulong16p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_vector16_restrict_p(local const long16 *restrict localconstlong16restrictp,\n"
-        "                                            local const ulong16* restrict localconstulong16restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_volatile_vector16_p(local volatile long16* localvolatilelong16p,\n"
-        "                                      local volatile ulong16 * localvolatileulong16p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_volatile_vector16_restrict_p(local volatile long16 * restrict localvolatilelong16restrictp,\n"
-        "                                               local volatile ulong16*restrict localvolatileulong16restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_volatile_vector16_p(local const volatile long16*localconstvolatilelong16p,\n"
-        "                                            local const volatile ulong16 *localconstvolatileulong16p)\n"
-        "{}\n",
-        "\n"
-        "kernel void local_const_volatile_vector16_restrict_p(local const volatile long16 *restrict localconstvolatilelong16restrictp,\n"
-        "                                                     local const volatile ulong16* restrict localconstvolatileulong16restrictp)\n"
-        "{}\n",
-        "\n"
-        "kernel void vector16_d(long16 long16d,\n"
-        "                       ulong16 ulong16d)\n"
-        "{}\n",
-        "\n"
-        "kernel void const_vector16_d(const long16 constlong16d,\n"
-        "                             const ulong16 constulong16d)\n"
-        "{}\n",
-        "\n"
-        "kernel void private_vector16_d(private long16 privatelong16d,\n"
-        "                               private ulong16 privateulong16d)\n"
-        "{}\n",
-        "\n"
-        "kernel void private_const_vector16_d(private const long16 privateconstlong16d,\n"
-        "                                     private const ulong16 privateconstulong16d)\n"
-        "{}\n",
-        "\n"
-};
-
-const char * long_arg_info[][72] = {
-  // The minimum value of CL_DEVICE_MAX_CONSTANT_ARGS is 4
-  {
-        "constant_scalar_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long*", "constantlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong*", "constantulongp",
-        NULL
-  },
-  {
-        "constant_scalar_p3",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong*", "constantunsignedlongp",
-        NULL
-    },
-  {
-        "constant_scalar_restrict_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "constantlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "constantulongrestrictp",
-        NULL
-  },
-  {
-        "constant_scalar_restrict_p3",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "constantunsignedlongrestrictp",
-        NULL
-    },
-    {
-        "global_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long*", "globallongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong*", "globalulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong*", "globalunsignedlongp",
-        NULL
-    },
-    {
-        "global_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "globallongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalunsignedlongrestrictp",
-        NULL
-    },
-    {
-        "global_const_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long*", "globalconstlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong*", "globalconstulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong*", "globalconstunsignedlongp",
-        NULL
-    },
-    {
-        "global_const_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "globalconstlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalconstulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalconstunsignedlongrestrictp",
-        NULL
-    },
-    {
-        "global_volatile_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long*", "globalvolatilelongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "globalvolatileulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "globalvolatileunsignedlongp",
-        NULL
-    },
-    {
-        "global_volatile_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "globalvolatilelongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalvolatileulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalvolatileunsignedlongrestrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long*", "globalconstvolatilelongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "globalconstvolatileulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "globalconstvolatileunsignedlongp",
-        NULL
-    },
-    {
-        "global_const_volatile_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "globalconstvolatilelongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalconstvolatileulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalconstvolatileunsignedlongrestrictp",
-        NULL
-    },
-    {
-        "local_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long*", "locallongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong*", "localulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong*", "localunsignedlongp",
-        NULL
-    },
-    {
-        "local_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "locallongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localunsignedlongrestrictp",
-        NULL
-    },
-    {
-        "local_const_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long*", "localconstlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong*", "localconstulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong*", "localconstunsignedlongp",
-        NULL
-    },
-    {
-        "local_const_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "localconstlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localconstulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localconstunsignedlongrestrictp",
-        NULL
-    },
-    {
-        "local_volatile_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long*", "localvolatilelongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "localvolatileulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "localvolatileunsignedlongp",
-        NULL
-    },
-    {
-        "local_volatile_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "localvolatilelongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localvolatileulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localvolatileunsignedlongrestrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long*", "localconstvolatilelongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "localconstvolatileulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "localconstvolatileunsignedlongp",
-        NULL
-    },
-    {
-        "local_const_volatile_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "localconstvolatilelongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localconstvolatileulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localconstvolatileunsignedlongrestrictp",
-        NULL
-    },
-    {
-        "scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long", "longd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "ulongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "unsignedlongd",
-        NULL
-    },
-    {
-        "const_scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long", "constlongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "constulongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "constunsignedlongd",
-        NULL
-    },
-    {
-        "private_scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long", "privatelongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "privateulongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "privateunsignedlongd",
-        NULL
-    },
-    {
-        "private_const_scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long", "privateconstlongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "privateconstulongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "privateconstunsignedlongd",
-        NULL
-    },
-    {
-        "constant_vector2_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long2*", "constantlong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong2*", "constantulong2p",
-        NULL
-    },
-    {
-        "constant_vector2_restrict_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "constantlong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "constantulong2restrictp",
-        NULL
-    },
-    {
-        "global_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long2*", "globallong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong2*", "globalulong2p",
-        NULL
-    },
-    {
-        "global_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "globallong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "globalulong2restrictp",
-        NULL
-    },
-    {
-        "global_const_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long2*", "globalconstlong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong2*", "globalconstulong2p",
-        NULL
-    },
-    {
-        "global_const_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "globalconstlong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "globalconstulong2restrictp",
-        NULL
-    },
-    {
-        "global_volatile_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long2*", "globalvolatilelong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong2*", "globalvolatileulong2p",
-        NULL
-    },
-    {
-        "global_volatile_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "globalvolatilelong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "globalvolatileulong2restrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long2*", "globalconstvolatilelong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong2*", "globalconstvolatileulong2p",
-        NULL
-    },
-    {
-        "global_const_volatile_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "globalconstvolatilelong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "globalconstvolatileulong2restrictp",
-        NULL
-    },
-    {
-        "local_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long2*", "locallong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong2*", "localulong2p",
-        NULL
-    },
-    {
-        "local_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "locallong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "localulong2restrictp",
-        NULL
-    },
-    {
-        "local_const_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long2*", "localconstlong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong2*", "localconstulong2p",
-        NULL
-    },
-    {
-        "local_const_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "localconstlong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "localconstulong2restrictp",
-        NULL
-    },
-    {
-        "local_volatile_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long2*", "localvolatilelong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong2*", "localvolatileulong2p",
-        NULL
-    },
-    {
-        "local_volatile_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "localvolatilelong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "localvolatileulong2restrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long2*", "localconstvolatilelong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong2*", "localconstvolatileulong2p",
-        NULL
-    },
-    {
-        "local_const_volatile_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "localconstvolatilelong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "localconstvolatileulong2restrictp",
-        NULL
-    },
-    {
-        "vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long2", "long2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong2", "ulong2d",
-        NULL
-    },
-    {
-        "const_vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long2", "constlong2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong2", "constulong2d",
-        NULL
-    },
-    {
-        "private_vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long2", "privatelong2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong2", "privateulong2d",
-        NULL
-    },
-    {
-        "private_const_vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long2", "privateconstlong2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong2", "privateconstulong2d",
-        NULL
-    },
-    {
-        "constant_vector3_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long3*", "constantlong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong3*", "constantulong3p",
-        NULL
-    },
-    {
-        "constant_vector3_restrict_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "constantlong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "constantulong3restrictp",
-        NULL
-    },
-    {
-        "global_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long3*", "globallong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong3*", "globalulong3p",
-        NULL
-    },
-    {
-        "global_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "globallong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "globalulong3restrictp",
-        NULL
-    },
-    {
-        "global_const_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long3*", "globalconstlong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong3*", "globalconstulong3p",
-        NULL
-    },
-    {
-        "global_const_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "globalconstlong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "globalconstulong3restrictp",
-        NULL
-    },
-    {
-        "global_volatile_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long3*", "globalvolatilelong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong3*", "globalvolatileulong3p",
-        NULL
-    },
-    {
-        "global_volatile_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "globalvolatilelong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "globalvolatileulong3restrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long3*", "globalconstvolatilelong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong3*", "globalconstvolatileulong3p",
-        NULL
-    },
-    {
-        "global_const_volatile_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "globalconstvolatilelong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "globalconstvolatileulong3restrictp",
-        NULL
-    },
-    {
-        "local_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long3*", "locallong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong3*", "localulong3p",
-        NULL
-    },
-    {
-        "local_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "locallong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "localulong3restrictp",
-        NULL
-    },
-    {
-        "local_const_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long3*", "localconstlong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong3*", "localconstulong3p",
-        NULL
-    },
-    {
-        "local_const_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "localconstlong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "localconstulong3restrictp",
-        NULL
-    },
-    {
-        "local_volatile_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long3*", "localvolatilelong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong3*", "localvolatileulong3p",
-        NULL
-    },
-    {
-        "local_volatile_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "localvolatilelong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "localvolatileulong3restrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long3*", "localconstvolatilelong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong3*", "localconstvolatileulong3p",
-        NULL
-    },
-    {
-        "local_const_volatile_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "localconstvolatilelong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "localconstvolatileulong3restrictp",
-        NULL
-    },
-    {
-        "vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long3", "long3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong3", "ulong3d",
-        NULL
-    },
-    {
-        "const_vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long3", "constlong3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong3", "constulong3d",
-        NULL
-    },
-    {
-        "private_vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long3", "privatelong3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong3", "privateulong3d",
-        NULL
-    },
-    {
-        "private_const_vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long3", "privateconstlong3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong3", "privateconstulong3d",
-        NULL
-    },
-    {
-        "constant_vector4_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long4*", "constantlong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong4*", "constantulong4p",
-        NULL
-    },
-    {
-        "constant_vector4_restrict_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "constantlong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "constantulong4restrictp",
-        NULL
-    },
-    {
-        "global_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long4*", "globallong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong4*", "globalulong4p",
-        NULL
-    },
-    {
-        "global_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "globallong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "globalulong4restrictp",
-        NULL
-    },
-    {
-        "global_const_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long4*", "globalconstlong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong4*", "globalconstulong4p",
-        NULL
-    },
-    {
-        "global_const_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "globalconstlong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "globalconstulong4restrictp",
-        NULL
-    },
-    {
-        "global_volatile_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long4*", "globalvolatilelong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong4*", "globalvolatileulong4p",
-        NULL
-    },
-    {
-        "global_volatile_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "globalvolatilelong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "globalvolatileulong4restrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long4*", "globalconstvolatilelong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong4*", "globalconstvolatileulong4p",
-        NULL
-    },
-    {
-        "global_const_volatile_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "globalconstvolatilelong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "globalconstvolatileulong4restrictp",
-        NULL
-    },
-    {
-        "local_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long4*", "locallong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong4*", "localulong4p",
-        NULL
-    },
-    {
-        "local_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "locallong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "localulong4restrictp",
-        NULL
-    },
-    {
-        "local_const_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long4*", "localconstlong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong4*", "localconstulong4p",
-        NULL
-    },
-    {
-        "local_const_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "localconstlong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "localconstulong4restrictp",
-        NULL
-    },
-    {
-        "local_volatile_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long4*", "localvolatilelong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong4*", "localvolatileulong4p",
-        NULL
-    },
-    {
-        "local_volatile_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "localvolatilelong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "localvolatileulong4restrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long4*", "localconstvolatilelong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong4*", "localconstvolatileulong4p",
-        NULL
-    },
-    {
-        "local_const_volatile_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "localconstvolatilelong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "localconstvolatileulong4restrictp",
-        NULL
-    },
-    {
-        "vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long4", "long4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong4", "ulong4d",
-        NULL
-    },
-    {
-        "const_vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long4", "constlong4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong4", "constulong4d",
-        NULL
-    },
-    {
-        "private_vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long4", "privatelong4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong4", "privateulong4d",
-        NULL
-    },
-    {
-        "private_const_vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long4", "privateconstlong4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong4", "privateconstulong4d",
-        NULL
-    },
-    {
-        "constant_vector8_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long8*", "constantlong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong8*", "constantulong8p",
-        NULL
-    },
-    {
-        "constant_vector8_restrict_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "constantlong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "constantulong8restrictp",
-        NULL
-    },
-    {
-        "global_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long8*", "globallong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong8*", "globalulong8p",
-        NULL
-    },
-    {
-        "global_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "globallong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "globalulong8restrictp",
-        NULL
-    },
-    {
-        "global_const_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long8*", "globalconstlong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong8*", "globalconstulong8p",
-        NULL
-    },
-    {
-        "global_const_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "globalconstlong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "globalconstulong8restrictp",
-        NULL
-    },
-    {
-        "global_volatile_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long8*", "globalvolatilelong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong8*", "globalvolatileulong8p",
-        NULL
-    },
-    {
-        "global_volatile_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "globalvolatilelong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "globalvolatileulong8restrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long8*", "globalconstvolatilelong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong8*", "globalconstvolatileulong8p",
-        NULL
-    },
-    {
-        "global_const_volatile_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "globalconstvolatilelong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "globalconstvolatileulong8restrictp",
-        NULL
-    },
-    {
-        "local_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long8*", "locallong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong8*", "localulong8p",
-        NULL
-    },
-    {
-        "local_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "locallong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "localulong8restrictp",
-        NULL
-    },
-    {
-        "local_const_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long8*", "localconstlong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong8*", "localconstulong8p",
-        NULL
-    },
-    {
-        "local_const_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "localconstlong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "localconstulong8restrictp",
-        NULL
-    },
-    {
-        "local_volatile_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long8*", "localvolatilelong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong8*", "localvolatileulong8p",
-        NULL
-    },
-    {
-        "local_volatile_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "localvolatilelong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "localvolatileulong8restrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long8*", "localconstvolatilelong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong8*", "localconstvolatileulong8p",
-        NULL
-    },
-    {
-        "local_const_volatile_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "localconstvolatilelong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "localconstvolatileulong8restrictp",
-        NULL
-    },
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include <iostream>
+#include <vector>
+#include "testBase.h"
+#include "harness/errorHelpers.h"
+#include "harness/typeWrappers.h"
+#include "harness/kernelHelpers.h"
+
+#define MINIMUM_OPENCL_PIPE_VERSION Version(2, 0)
+
+static constexpr size_t CL_VERSION_LENGTH = 128;
+static constexpr size_t KERNEL_ARGUMENT_LENGTH = 128;
+static constexpr char KERNEL_ARGUMENT_NAME[] = "argument";
+static constexpr size_t KERNEL_ARGUMENT_NAME_LENGTH =
+    sizeof(KERNEL_ARGUMENT_NAME) + 1;
+static constexpr int SINGLE_KERNEL_ARG_NUMBER = 0;
+static constexpr int MAX_NUMBER_OF_KERNEL_ARGS = 128;
+
+static const std::vector<cl_kernel_arg_address_qualifier> address_qualifiers = {
+    CL_KERNEL_ARG_ADDRESS_GLOBAL, CL_KERNEL_ARG_ADDRESS_LOCAL,
+    CL_KERNEL_ARG_ADDRESS_CONSTANT, CL_KERNEL_ARG_ADDRESS_PRIVATE
+};
+
+static const std::vector<std::string> image_arguments = {
+    "image2d_t", "image3d_t",        "image2d_array_t",
+    "image1d_t", "image1d_buffer_t", "image1d_array_t"
+};
+
+static const std::vector<cl_kernel_arg_access_qualifier> access_qualifiers = {
+    CL_KERNEL_ARG_ACCESS_READ_WRITE, CL_KERNEL_ARG_ACCESS_READ_ONLY,
+    CL_KERNEL_ARG_ACCESS_WRITE_ONLY
+};
+
+static const std::vector<cl_kernel_arg_type_qualifier> type_qualifiers = {
+    CL_KERNEL_ARG_TYPE_NONE,
+    CL_KERNEL_ARG_TYPE_CONST,
+    CL_KERNEL_ARG_TYPE_VOLATILE,
+    CL_KERNEL_ARG_TYPE_RESTRICT,
+    CL_KERNEL_ARG_TYPE_CONST | CL_KERNEL_ARG_TYPE_VOLATILE,
+    CL_KERNEL_ARG_TYPE_CONST | CL_KERNEL_ARG_TYPE_RESTRICT,
+    CL_KERNEL_ARG_TYPE_VOLATILE | CL_KERNEL_ARG_TYPE_RESTRICT,
+    CL_KERNEL_ARG_TYPE_CONST | CL_KERNEL_ARG_TYPE_VOLATILE
+        | CL_KERNEL_ARG_TYPE_RESTRICT,
+};
+
+static const std::vector<cl_kernel_arg_type_qualifier> pipe_qualifiers = {
+    CL_KERNEL_ARG_TYPE_PIPE,
+    CL_KERNEL_ARG_TYPE_CONST | CL_KERNEL_ARG_TYPE_PIPE,
+    CL_KERNEL_ARG_TYPE_VOLATILE | CL_KERNEL_ARG_TYPE_PIPE,
+    CL_KERNEL_ARG_TYPE_CONST | CL_KERNEL_ARG_TYPE_VOLATILE
+        | CL_KERNEL_ARG_TYPE_PIPE,
+};
+
+static std::string
+get_address_qualifier(cl_kernel_arg_address_qualifier address_qualifier)
+{
+    std::string ret;
+    if (address_qualifier == CL_KERNEL_ARG_ADDRESS_GLOBAL)
+        ret = "global";
+    else if (address_qualifier == CL_KERNEL_ARG_ADDRESS_CONSTANT)
+        ret = "constant";
+    else if (address_qualifier == CL_KERNEL_ARG_ADDRESS_LOCAL)
+        ret = "local";
+    else if (address_qualifier == CL_KERNEL_ARG_ADDRESS_PRIVATE)
+        ret = "private";
+    return ret;
+}
+
+static std::string
+get_access_qualifier(cl_kernel_arg_access_qualifier qualifier)
+{
+    std::string ret;
+    if (qualifier == CL_KERNEL_ARG_ACCESS_READ_ONLY) ret = "read_only";
+    if (qualifier == CL_KERNEL_ARG_ACCESS_WRITE_ONLY) ret = "write_only";
+    if (qualifier == CL_KERNEL_ARG_ACCESS_READ_WRITE) ret = "read_write";
+    return ret;
+}
+
+static std::string
+get_type_qualifier_prefix(cl_kernel_arg_type_qualifier type_qualifier)
+{
+    std::string ret;
+    if (type_qualifier & CL_KERNEL_ARG_TYPE_CONST) ret += "const ";
+    if (type_qualifier & CL_KERNEL_ARG_TYPE_VOLATILE) ret += "volatile ";
+    if (type_qualifier & CL_KERNEL_ARG_TYPE_PIPE) ret += "pipe ";
+    return ret;
+}
+
+static std::string
+get_type_qualifier_postfix(cl_kernel_arg_type_qualifier type_qualifier)
+{
+    std::string ret;
+    if (type_qualifier & CL_KERNEL_ARG_TYPE_RESTRICT) ret = "restrict";
+    return ret;
+}
+
+class KernelArgInfo {
+public:
+    KernelArgInfo(cl_kernel_arg_address_qualifier input_address_qualifier,
+                  cl_kernel_arg_access_qualifier input_access_qualifier,
+                  cl_kernel_arg_type_qualifier input_type_qualifier,
+                  const std::string& input_arg_type, const int argument_number,
+                  const std::string& input_arg_string = "")
+        : address_qualifier(input_address_qualifier),
+          access_qualifier(input_access_qualifier),
+          type_qualifier(input_type_qualifier), arg_string(input_arg_string)
+    {
+        strcpy(arg_type, input_arg_type.c_str());
+        std::string input_arg_name =
+            KERNEL_ARGUMENT_NAME + std::to_string(argument_number);
+        strcpy(arg_name, input_arg_name.c_str());
+    };
+    KernelArgInfo() = default;
+    cl_kernel_arg_address_qualifier address_qualifier;
+    cl_kernel_arg_access_qualifier access_qualifier;
+    cl_kernel_arg_type_qualifier type_qualifier;
+    char arg_type[KERNEL_ARGUMENT_LENGTH];
+    char arg_name[KERNEL_ARGUMENT_LENGTH];
+    std::string arg_string;
+};
+
+static std::string generate_argument(const KernelArgInfo& kernel_arg)
+{
+    std::string ret;
+
+    const bool is_image = strstr(kernel_arg.arg_type, "image")
+        || strstr(kernel_arg.arg_type, "sampler");
+    std::string address_qualifier = "";
+    // Image Objects are always allocated from the global address space so the
+    // qualifier should not be specified
+    if (!is_image)
     {
-        "vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long8", "long8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong8", "ulong8d",
-        NULL
-    },
+        address_qualifier = get_address_qualifier(kernel_arg.address_qualifier);
+    }
+
+    std::string access_qualifier =
+        get_access_qualifier(kernel_arg.access_qualifier);
+    std::string type_qualifier_prefix =
+        get_type_qualifier_prefix(kernel_arg.type_qualifier);
+    std::string type_qualifier_postfix =
+        get_type_qualifier_postfix(kernel_arg.type_qualifier);
+
+    ret += address_qualifier + " ";
+    ret += access_qualifier + " ";
+    ret += type_qualifier_prefix + " ";
+    ret += kernel_arg.arg_type;
+    ret += " ";
+    ret += type_qualifier_postfix + " ";
+    ret += kernel_arg.arg_name;
+    return ret;
+}
+
+/* This function generates a kernel source and allows for multiple arguments to
+ * be passed in and subsequently queried. */
+static std::string generate_kernel(const std::vector<KernelArgInfo>& all_args,
+                                   const bool supports_3d_image_writes = false)
+{
+
+    std::string ret;
+    if (supports_3d_image_writes)
     {
-        "const_vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long8", "constlong8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong8", "constulong8d",
-        NULL
-    },
+        ret += "#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable\n";
+    }
+    ret += "kernel void get_kernel_arg_info(\n";
+    for (int i = 0; i < all_args.size(); ++i)
     {
-        "private_vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long8", "privatelong8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong8", "privateulong8d",
-        NULL
-    },
+        const KernelArgInfo& arg = all_args[i];
+        ret += generate_argument(all_args[i]);
+        if (i == all_args.size() - 1)
+        {
+            ret += "\n";
+        }
+        else
+        {
+            ret += ",\n";
+        }
+    }
+    ret += "){}";
+    return ret;
+}
+
+static const char* get_kernel_arg_address_qualifier(
+    cl_kernel_arg_address_qualifier address_qualifier)
+{ /* Returns a printable name for an address qualifier. */
+    switch (address_qualifier)
     {
-        "private_const_vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long8", "privateconstlong8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong8", "privateconstulong8d",
-        NULL
-    },
+        case CL_KERNEL_ARG_ADDRESS_GLOBAL: {
+            return "GLOBAL";
+        }
+        case CL_KERNEL_ARG_ADDRESS_LOCAL: {
+            return "LOCAL";
+        }
+        case CL_KERNEL_ARG_ADDRESS_CONSTANT: {
+            return "CONSTANT";
+        }
+        default: { /* every other value is reported as PRIVATE */
+            return "PRIVATE";
+        }
+    }
+}
+
+static const char*
+get_kernel_arg_access_qualifier(cl_kernel_arg_access_qualifier access_qualifier)
+{ /* Returns a printable name for an access qualifier. */
+    switch (access_qualifier)
     {
-        "constant_vector16_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long16*", "constantlong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong16*", "constantulong16p",
-        NULL
-    },
+        case CL_KERNEL_ARG_ACCESS_READ_ONLY: {
+            return "READ_ONLY";
+        }
+        case CL_KERNEL_ARG_ACCESS_WRITE_ONLY: {
+            return "WRITE_ONLY";
+        }
+        case CL_KERNEL_ARG_ACCESS_READ_WRITE: {
+            return "READ_WRITE";
+        }
+        default: { /* every other value is reported as NONE */
+            return "NONE";
+        }
+    }
+}
+
+std::string
+get_kernel_arg_type_qualifier(cl_kernel_arg_type_qualifier type_qualifier)
+{
+    std::string ret;
+    /* Append one token per qualifier bit that is set; no bits gives "". */
+    if (type_qualifier & CL_KERNEL_ARG_TYPE_CONST) ret += "CONST ";
+    if (type_qualifier & CL_KERNEL_ARG_TYPE_RESTRICT) ret += "RESTRICT ";
+    if (type_qualifier & CL_KERNEL_ARG_TYPE_VOLATILE) ret += "VOLATILE ";
+    if (type_qualifier & CL_KERNEL_ARG_TYPE_PIPE) ret += "PIPE";
+
+    return ret;
+}
+
+static void output_difference(const KernelArgInfo& expected,
+                              const KernelArgInfo& actual)
+{ /* Logs each field of `actual` that differs from `expected`. */
+    if (actual.address_qualifier != expected.address_qualifier)
     {
-        "constant_vector16_restrict_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "constantlong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "constantulong16restrictp",
-        NULL
-    },
+        log_error("Address Qualifier: Expected: %s\t Actual: %s\n",
+                  get_kernel_arg_address_qualifier(expected.address_qualifier),
+                  get_kernel_arg_address_qualifier(actual.address_qualifier));
+    }
+    if (actual.access_qualifier != expected.access_qualifier)
     {
-        "global_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long16*", "globallong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong16*", "globalulong16p",
-        NULL
-    },
+        log_error("Access Qualifier: Expected: %s\t Actual: %s\n",
+                  get_kernel_arg_access_qualifier(expected.access_qualifier),
+                  get_kernel_arg_access_qualifier(actual.access_qualifier));
+    }
+    if (actual.type_qualifier != expected.type_qualifier)
     {
-        "global_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "globallong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "globalulong16restrictp",
-        NULL
-    },
+        log_error(
+            "Type Qualifier: Expected: %s\t Actual: %s\n",
+            get_kernel_arg_type_qualifier(expected.type_qualifier).c_str(),
+            get_kernel_arg_type_qualifier(actual.type_qualifier).c_str());
+    }
+    if (strcmp(actual.arg_type, expected.arg_type) != 0)
     {
-        "global_const_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long16*", "globalconstlong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong16*", "globalconstulong16p",
-        NULL
-    },
+        log_error("Arg Type: Expected: %s\t Actual: %s\n", expected.arg_type,
+                  actual.arg_type);
+    }
+    if (strcmp(actual.arg_name, expected.arg_name) != 0)
     {
-        "global_const_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "globalconstlong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "globalconstulong16restrictp",
-        NULL
-    },
+        log_error("Arg Name: Expected: %s\t Actual: %s\n", expected.arg_name,
+                  actual.arg_name);
+    }
+    log_error("Argument in Kernel Source Reported as:\n%s\n",
+              expected.arg_string.c_str()); /* logged unconditionally */
+}
+static int compare_expected_actual(const KernelArgInfo& expected,
+                                   const KernelArgInfo& actual)
+{ /* Returns TEST_PASS when all five compared fields match, else TEST_FAIL. */
+    ++gTestCount; /* every comparison is counted, pass or fail */
+    int ret = TEST_PASS;
+    if ((actual.address_qualifier != expected.address_qualifier)
+        || (actual.access_qualifier != expected.access_qualifier)
+        || (actual.type_qualifier != expected.type_qualifier)
+        || (strcmp(actual.arg_type, expected.arg_type) != 0)
+        || (strcmp(actual.arg_name, expected.arg_name) != 0))
+    {
+        ret = TEST_FAIL;
+        output_difference(expected, actual); /* log which fields differ */
+        ++gFailCount;
+    }
+    return ret;
+}
+
+static bool device_supports_pipes(cl_device_id deviceID)
+{ /* True when the device version and its pipe packet size allow pipes. */
+    auto version = get_device_cl_version(deviceID);
+    if (version < MINIMUM_OPENCL_PIPE_VERSION)
     {
-        "global_volatile_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long16*", "globalvolatilelong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong16*", "globalvolatileulong16p",
-        NULL
-    },
+        return false;
+    }
+    cl_uint max_packet_size = 0;
+    cl_int err =
+        clGetDeviceInfo(deviceID, CL_DEVICE_PIPE_MAX_PACKET_SIZE,
+                        sizeof(max_packet_size), &max_packet_size, nullptr);
+    test_error_ret(err, "clGetDeviceInfo", false);
+    if ((max_packet_size == 0) && (version >= Version(3, 0))) /* 3.0+ only */
+    {
+        return false; /* a zero packet size is treated as "no pipes" */
+    }
+    return true;
+}
+
+static std::string get_build_options(cl_device_id deviceID)
+{ /* Build options for the test kernels; always has -cl-kernel-arg-info. */
+    std::string ret = "-cl-kernel-arg-info";
+    if (get_device_cl_version(deviceID) >= MINIMUM_OPENCL_PIPE_VERSION)
     {
-        "global_volatile_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "globalvolatilelong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "globalvolatileulong16restrictp",
-        NULL
-    },
+        if (device_supports_pipes(deviceID)) /* -cl-std only added for pipes */
+        {
+            if (get_device_cl_version(deviceID) >= Version(3, 0))
+            {
+                ret += " -cl-std=CL3.0";
+            }
+            else
+            {
+                ret += " -cl-std=CL2.0";
+            }
+        }
+    }
+    return ret;
+}
+
+static std::string get_expected_arg_type(const std::string& type_string,
+                                         const bool is_pointer)
+{ /* Normalises a type: drops "signed", turns "unsigned" into a 'u' prefix. */
+    bool is_unsigned = false;
+    std::istringstream type_stream(type_string);
+    std::string base_type = "";
+    std::string ret = "";
+    /* A bare "signed" or "unsigned" (with or without '*') denotes an int */
+    if (type_string == "signed" || type_string == "signed*")
+    {
+        base_type = "int";
+    }
+    else if (type_string == "unsigned" || type_string == "unsigned*")
     {
-        "global_const_volatile_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long16*", "globalconstvolatilelong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong16*", "globalconstvolatileulong16p",
-        NULL
-    },
+        base_type = "int";
+        is_unsigned = true;
+    }
+    else
     {
-        "global_const_volatile_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "globalconstvolatilelong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "globalconstvolatileulong16restrictp",
-        NULL
-    },
+        std::string token;
+        /* Walk the space-separated tokens: note an "unsigned" token and keep
+         * the last token that does not contain "signed" as the base type */
+        while (std::getline(type_stream, token, ' '))
+        {
+            if (token.find("unsigned") != std::string::npos)
+            {
+                is_unsigned = true;
+            }
+            if (token.find("signed") == std::string::npos) /* skips unsigned too */
+            {
+                base_type = token;
+            }
+        }
+    }
+    ret = base_type;
+    if (is_unsigned)
     {
-        "local_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long16*", "locallong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong16*", "localulong16p",
-        NULL
-    },
+        ret.insert(0, "u");
+    }
+    /* Append '*' when a pointer is requested and the type does not already
+     * end in one */
+    if (is_pointer && ret.back() != '*')
     {
-        "local_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "locallong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "localulong16restrictp",
-        NULL
-    },
+        ret += "*";
+    }
+    return ret;
+}
+
+static KernelArgInfo
+create_expected_arg_info(const KernelArgInfo& kernel_argument, bool is_pointer)
+{ /* Copies kernel_argument, then adjusts the fields as commented below. */
+    KernelArgInfo ret = kernel_argument;
+    const std::string arg_string = generate_argument(kernel_argument);
+    ret.arg_string = arg_string;
+
+    std::string type_string(kernel_argument.arg_type);
+    /* We only need to modify the expected return values for scalar types */
+    if ((is_pointer && !isdigit(type_string.back() - 1)) /* NOTE(review): "- 1" is applied to the character value, not the index; confirm intent */
+        || !isdigit(type_string.back()))
+    {
+        std::string expected_arg_type =
+            get_expected_arg_type(type_string, is_pointer);
+
+        /* Reset the contents of the expected arg_type char[] and then assign
+         * it the expected value */
+        memset(ret.arg_type, 0, sizeof(ret.arg_type));
+        strcpy(ret.arg_type, expected_arg_type.c_str());
+    }
+
+    /* Any value that is not passed as a pointer has TYPE_NONE */
+    if (!is_pointer)
     {
-        "local_const_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long16*", "localconstlong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong16*", "localconstulong16p",
-        NULL
-    },
+        ret.type_qualifier = CL_KERNEL_ARG_TYPE_NONE;
+    }
+
+    /* If the address qualifier is CONSTANT we expect to see the TYPE_CONST
+     * qualifier */
+    if (kernel_argument.address_qualifier == CL_KERNEL_ARG_ADDRESS_CONSTANT)
     {
-        "local_const_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "localconstlong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "localconstulong16restrictp",
-        NULL
-    },
+        ret.type_qualifier |= CL_KERNEL_ARG_TYPE_CONST;
+    }
+
+    /* The PIPE qualifier is special: the expected address qualifier becomes
+     * GLOBAL and every other type qualifier is dropped */
+    if (kernel_argument.type_qualifier & CL_KERNEL_ARG_TYPE_PIPE)
     {
-        "local_volatile_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long16*", "localvolatilelong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong16*", "localvolatileulong16p",
-        NULL
-    },
+        ret.address_qualifier = CL_KERNEL_ARG_ADDRESS_GLOBAL;
+        ret.type_qualifier = CL_KERNEL_ARG_TYPE_PIPE;
+    }
+
+    return ret;
+}
+
+/* There are too many vector types for it to be worth writing them down
+ * statically, so they are generated here and combined with all of the scalar
+ * and unsigned scalar types in a single data structure */
+static std::vector<std::string>
+generate_all_type_arguments(cl_device_id deviceID)
+{
+    std::vector<std::string> ret = {
+        "char",           "short",        "int",           "float",
+        "void",           "uchar",        "unsigned char", "ushort",
+        "unsigned short", "uint",         "unsigned int",  "char unsigned",
+        "short unsigned", "int unsigned", "signed short",  "signed int",
+        "signed long",    "short signed", "int signed",    "signed",
+        "unsigned"
+    };
+
+    std::vector<std::string> vector_types = { "char",   "uchar", "short",
+                                              "ushort", "int",   "uint",
+                                              "float" };
+    if (gHasLong) /* long spellings are only listed when gHasLong is set */
+    {
+        ret.push_back("long");
+        ret.push_back("ulong");
+        ret.push_back("unsigned long");
+        ret.push_back("long unsigned");
+        ret.push_back("long signed");
+        vector_types.push_back("long");
+        vector_types.push_back("ulong");
+    }
+    if (device_supports_half(deviceID))
     {
-        "local_volatile_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "localvolatilelong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "localvolatileulong16restrictp",
-        NULL
-    },
+        vector_types.push_back("half"); /* added as vector types only */
+    }
+    if (device_supports_double(deviceID))
     {
-        "local_const_volatile_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long16*", "localconstvolatilelong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong16*", "localconstvolatileulong16p",
-        NULL
-    },
+        vector_types.push_back("double"); /* added as vector types only */
+    }
+    static const std::vector<std::string> vector_values = { "2", "3", "4", "8",
+                                                            "16" };
+    for (auto vector_type : vector_types)
     {
-        "local_const_volatile_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "localconstvolatilelong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "localconstvolatileulong16restrictp",
-        NULL
-    },
+        for (auto vector_value : vector_values)
+        {
+            ret.push_back(vector_type + vector_value); /* e.g. "int" + "4" */
+        }
+    }
+    return ret;
+}
+
+static int
+compare_kernel_with_expected(cl_context context, cl_device_id deviceID,
+                             const char* kernel_src,
+                             const std::vector<KernelArgInfo>& expected_args)
+{ /* Builds kernel_src, queries every argument and sums the comparisons. */
+    int failed_tests = 0;
+    clKernelWrapper kernel;
+    clProgramWrapper program;
+    cl_int err = create_single_kernel_helper_with_build_options(
+        context, &program, &kernel, 1, &kernel_src, "get_kernel_arg_info",
+        get_build_options(deviceID).c_str());
+    test_error(err, "create_single_kernel_helper_with_build_options");
+    for (cl_uint i = 0; i < expected_args.size(); ++i) /* cl_uint: arg index */
+    {
+        KernelArgInfo actual;
+        err = clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_ADDRESS_QUALIFIER,
+                                 sizeof(actual.address_qualifier),
+                                 &(actual.address_qualifier), nullptr);
+        test_error(err, "clGetKernelArgInfo");
+
+        err = clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_ACCESS_QUALIFIER,
+                                 sizeof(actual.access_qualifier),
+                                 &(actual.access_qualifier), nullptr);
+        test_error(err, "clGetKernelArgInfo");
+
+        err = clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_QUALIFIER,
+                                 sizeof(actual.type_qualifier),
+                                 &(actual.type_qualifier), nullptr);
+        test_error(err, "clGetKernelArgInfo");
+
+        err = clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME,
+                                 sizeof(actual.arg_type), &(actual.arg_type),
+                                 nullptr);
+        test_error(err, "clGetKernelArgInfo");
+
+        err = clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME,
+                                 sizeof(actual.arg_name), &(actual.arg_name),
+                                 nullptr);
+        test_error(err, "clGetKernelArgInfo");
+
+        failed_tests += compare_expected_actual(expected_args[i], actual);
+    }
+    return failed_tests;
+}
+
+size_t get_param_size(const std::string& arg_type, cl_device_id deviceID,
+                      bool is_pipe)
+{ /* Size in bytes that this test attributes to one kernel argument. */
+    if (is_pipe)
     {
-        "vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long16", "long16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong16", "ulong16d",
-        NULL
-    },
+        return (sizeof(int*)); /* pipes are counted as one host pointer */
+    }
+    if (arg_type.find("*") != std::string::npos)
     {
-        "const_vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long16", "constlong16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong16", "constulong16d",
-        NULL
-    },
+        cl_uint device_address_bits = 0;
+        cl_int err = clGetDeviceInfo(deviceID, CL_DEVICE_ADDRESS_BITS,
+                                     sizeof(device_address_bits),
+                                     &device_address_bits, NULL); /* NOTE(review): err is never checked */
+        return (device_address_bits / 8); /* pointer size in bytes */
+    }
+
+    size_t ret(0);
+    if (arg_type.find("char") != std::string::npos)
     {
-        "private_vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long16", "privatelong16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong16", "privateulong16d",
-        NULL
-    },
+        ret += sizeof(cl_char);
+    }
+    if (arg_type.find("short") != std::string::npos)
     {
-        "private_const_vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long16", "privateconstlong16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong16", "privateconstulong16d",
-        NULL
-    },
-};
-
-template<typename arg_info_t>
-int test(cl_device_id deviceID, cl_context context, kernel_args_t kernel_args, cl_uint lines_count, arg_info_t arg_info, size_t total_kernels_in_program) {
-
-    const size_t max_name_len = 512;
-    cl_char name[ max_name_len ];
-    cl_uint arg_count, numArgs;
-    size_t i, j, size;
-    int error;
-
-    clProgramWrapper program =
-    clCreateProgramWithSource(context, lines_count, kernel_args, NULL, &error);
-    if ( program == NULL || error != CL_SUCCESS )
+        ret += sizeof(cl_short);
+    }
+    if (arg_type.find("half") != std::string::npos)
     {
-        print_error( error, "Unable to create required arguments kernel program" );
-        return -1;
+        ret += sizeof(cl_half);
     }
-
-    // Compile the program
-    log_info( "Building kernels...\n" );
-    clBuildProgram( program, 1, &deviceID, "-cl-kernel-arg-info", NULL, NULL );
-
-    // check for build errors and exit if things didn't work
-    size_t size_ret;
-    cl_build_status build_status;
-    error = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_STATUS, sizeof(build_status), &build_status, &size_ret);
-    test_error( error, "Unable to query build status" );
-    if (build_status == CL_BUILD_ERROR) {
-        printf("CL_PROGRAM_BUILD_STATUS=%d\n", (int) build_status);
-        error = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &size_ret);
-        test_error( error, "Unable to get build log size" );
-        char *build_log = (char *)malloc(size_ret);
-        error = clGetProgramBuildInfo(program,deviceID, CL_PROGRAM_BUILD_LOG, size_ret, build_log, &size_ret);
-        test_error( error, "Unable to get build log" );
-        printf("CL_PROGRAM_BUILD_LOG:\n%s\n", build_log);
-        printf("CL_BUILD_ERROR. exiting\n");
-        free(build_log);
-        return -1;
+    if (arg_type.find("int") != std::string::npos)
+    {
+        ret += sizeof(cl_int);
     }
-
-    // Lookup the number of kernels in the program.
-    log_info( "Testing kernels...\n" );
-    size_t total_kernels = 0;
-    error = clGetProgramInfo( program, CL_PROGRAM_NUM_KERNELS, sizeof( size_t ), &total_kernels, NULL );
-    test_error( error, "Unable to get program info num kernels" );
-
-    if ( total_kernels != total_kernels_in_program )
+    if (arg_type.find("long") != std::string::npos)
     {
-        print_error( error, "Program did not build all kernels" );
-        return -1;
+        ret += sizeof(cl_long);
     }
-
-    // Lookup the kernel names.
-    size_t kernel_names_len = 0;
-    error = clGetProgramInfo( program, CL_PROGRAM_KERNEL_NAMES, 0, NULL, &kernel_names_len );
-    test_error( error, "Unable to get length of kernel names list." );
-
-    size_t expected_kernel_names_len = 0;
-    for ( i = 0; i < total_kernels; ++i )
+    if (arg_type.find("float") != std::string::npos)
     {
-        expected_kernel_names_len += 1 + strlen( arg_info[ i ][ 0 ] );
+        ret += sizeof(cl_float);
     }
-    if ( kernel_names_len != expected_kernel_names_len )
+    if (arg_type.find("double") != std::string::npos)
     {
-        log_error( "Kernel names string is not the right length, expected %d, got %d\n", (int) expected_kernel_names_len, (int) kernel_names_len );
-        return -1;
+        ret += sizeof(cl_double);
     }
-
-    const size_t len = ( kernel_names_len + 1 ) * sizeof( char );
-    char* kernel_names = (char*) malloc( len );
-    error = clGetProgramInfo( program, CL_PROGRAM_KERNEL_NAMES, len, kernel_names, &kernel_names_len );
-    test_error( error, "Unable to get kernel names list." );
-
-    // Check to see if the kernel name array is null terminated.
-    if ( kernel_names[ kernel_names_len - 1 ] != '\0' )
+    if (arg_type.back() == '2')
     {
-        free( kernel_names );
-        print_error( error, "Kernel name list was not null terminated" );
-        return -1;
+        ret *= 2;
     }
-
-    // Check to see if the correct kernel name string was returned.
-    // Does the string contain each expected kernel name?
-    for ( i = 0; i < total_kernels; ++i )
-        if ( !strstr( kernel_names, arg_info[ i ][ 0 ] ) )
-            break;
-    if ( i != total_kernels )
+    if (arg_type.back() == '3')
     {
-        log_error( "Kernel names string is missing \"%s\"\n", arg_info[ i ][ 0 ] );
-        free( kernel_names );
-        return -1;
+        ret *= 4; /* a 3-element vector is sized like a 4-element one */
     }
-
-    // Are the kernel names delimited by ';'?
-    if ( !strtok( kernel_names, ";" ) )
+    if (arg_type.back() == '4')
     {
-        error = -1;
+        ret *= 4;
     }
-    else
+    if (arg_type.back() == '8')
     {
-        for ( i = 1; i < total_kernels; ++i )
-        {
-            if ( !strtok( NULL, ";" ) )
-            {
-                error = -1;
-            }
-        }
+        ret *= 8;
     }
-    if ( error )
+    // A trailing '6' can only come from a 16-element vector type
+    if (arg_type.back() == '6')
     {
-        log_error( "Kernel names string was not properly delimited by ';'\n" );
-        free( kernel_names );
-        return -1;
+        ret *= 16;
     }
-    free( kernel_names );
+    return ret;
+}
 
-    // Create kernel objects and query them.
-    int rc = 0;
-    for ( i = 0; i < total_kernels; ++i )
-    {
-        int kernel_rc = 0;
-        const char* kernel_name = arg_info[ i ][ 0 ];
-        clKernelWrapper kernel = clCreateKernel(program, kernel_name, &error);
-        if( kernel == NULL || error != CL_SUCCESS )
-        {
-            log_error( "ERROR: Could not get kernel: %s\n", kernel_name );
-            kernel_rc = -1;
-        }
+static int run_scalar_vector_tests(cl_context context, cl_device_id deviceID)
+{
+    int failed_tests = 0;
 
-        if(kernel_rc == 0)
-        {
-            // Determine the expected number of arguments.
-            arg_count = 0;
-            while (arg_info[ i ][ (ARG_INFO_FIELD_COUNT * arg_count) + 1 ] != NULL)
-                ++arg_count;
+    std::vector<std::string> type_arguments =
+        generate_all_type_arguments(deviceID);
 
-            // Try to get the number of arguments.
-            error = clGetKernelInfo( kernel, CL_KERNEL_NUM_ARGS, 0, NULL, &size );
-            test_error( error, "Unable to get kernel arg count param size" );
-            if( size != sizeof( numArgs ) )
-            {
-                log_error( "ERROR: Kernel arg count param returns invalid size (expected %d, got %d) for kernel: %s\n", (int)sizeof( numArgs ), (int)size, kernel_name );
-                kernel_rc = -1;
-            }
-        }
+    const std::vector<cl_kernel_arg_access_qualifier> access_qualifiers = {
+        CL_KERNEL_ARG_ACCESS_NONE, CL_KERNEL_ARG_ACCESS_READ_ONLY,
+        CL_KERNEL_ARG_ACCESS_WRITE_ONLY
+    };
 
+    std::vector<KernelArgInfo> all_args, expected_args;
+    size_t max_param_size = get_max_param_size(deviceID);
+    size_t total_param_size(0);
+    for (auto address_qualifier : address_qualifiers)
+    {
+        bool is_private = (address_qualifier == CL_KERNEL_ARG_ADDRESS_PRIVATE);
 
-        if(kernel_rc == 0)
-        {
-            error = clGetKernelInfo( kernel, CL_KERNEL_NUM_ARGS, sizeof( numArgs ), &numArgs, NULL );
-            test_error( error, "Unable to get kernel arg count" );
-            if( numArgs != arg_count )
-            {
-                log_error( "ERROR: Kernel arg count returned invalid value (expected %d, got %d) for kernel: %s\n", arg_count, numArgs, kernel_name );
-                kernel_rc = -1;
-            }
-        }
+        /* OpenCL kernels cannot take "private" pointers and only "private"
+         * variables can take values */
+        bool is_pointer = !is_private;
 
-        if(kernel_rc == 0)
+        for (auto type_qualifier : type_qualifiers)
         {
-            for ( j = 0; j < numArgs; ++j )
+            bool is_pipe = (type_qualifier & CL_KERNEL_ARG_TYPE_PIPE);
+            bool is_restrict = (type_qualifier & CL_KERNEL_ARG_TYPE_RESTRICT);
+
+            for (auto access_qualifier : access_qualifiers)
             {
+                bool has_access_qualifier =
+                    (access_qualifier != CL_KERNEL_ARG_ACCESS_NONE);
 
-                int arg_rc = 0;
-                cl_kernel_arg_address_qualifier expected_address_qualifier = (cl_kernel_arg_address_qualifier)(uintptr_t)arg_info[ i ][ (ARG_INFO_FIELD_COUNT * j) + ARG_INFO_ADDR_OFFSET ];
-                cl_kernel_arg_access_qualifier expected_access_qualifier =  (cl_kernel_arg_access_qualifier)(uintptr_t)arg_info[ i ][ (ARG_INFO_FIELD_COUNT * j) + ARG_INFO_ACCESS_OFFSET ];
-                cl_kernel_arg_type_qualifier expected_type_qualifier = (cl_kernel_arg_type_qualifier)(uintptr_t)arg_info[ i ][ (ARG_INFO_FIELD_COUNT * j) + ARG_INFO_TYPE_QUAL_OFFSET ];
-                const char* expected_type_name = arg_info[ i ][ (ARG_INFO_FIELD_COUNT * j) + ARG_INFO_TYPE_NAME_OFFSET ];
-                const char* expected_arg_name = arg_info[ i ][ (ARG_INFO_FIELD_COUNT * j) + ARG_INFO_ARG_NAME_OFFSET ];
+                /*Only images and pipes can have an access qualifier,
+                 * otherwise it should be ACCESS_NONE */
+                if (!is_pipe && has_access_qualifier)
+                {
+                    continue;
+                }
 
-                // Try to get the address qualifier of each argument.
-                cl_kernel_arg_address_qualifier address_qualifier = 0;
-                error = clGetKernelArgInfo( kernel, (cl_uint)j, CL_KERNEL_ARG_ADDRESS_QUALIFIER, sizeof address_qualifier, &address_qualifier, &size );
-                test_error( error, "Unable to get argument address qualifier" );
-                error = (address_qualifier != expected_address_qualifier);
-                if ( error )
+                /* If the type is a pipe, then either the specified or
+                 * default access qualifier is returned and so "NONE" will
+                 * never be returned */
+                if (is_pipe && !has_access_qualifier)
                 {
-                    log_error( "ERROR: Bad address qualifier, kernel: \"%s\", argument number: %d, expected \"0x%X\", got \"0x%X\"\n", kernel_name, (unsigned int)j, (unsigned int)expected_address_qualifier, (unsigned int)address_qualifier );
-                    arg_rc = -1;
+                    continue;
                 }
 
-                // Try to get the access qualifier of each argument.
-                cl_kernel_arg_access_qualifier access_qualifier = 0;
-                error = clGetKernelArgInfo( kernel, (cl_uint)j, CL_KERNEL_ARG_ACCESS_QUALIFIER, sizeof access_qualifier, &access_qualifier, &size );
-                test_error( error, "Unable to get argument access qualifier" );
-                error = (access_qualifier != expected_access_qualifier);
-                if ( error )
+                /* The "restrict" type qualifier can only apply to
+                 * pointers
+                 */
+                if (is_restrict && !is_pointer)
                 {
-                    log_error( "ERROR: Bad access qualifier, kernel: \"%s\", argument number: %d, expected \"0x%X\", got \"0x%X\"\n", kernel_name, (unsigned int)j, (unsigned int)expected_access_qualifier, (unsigned int)access_qualifier );
-                    arg_rc = -1;
+                    continue;
                 }
 
-                // Try to get the type qualifier of each argument.
-                cl_kernel_arg_type_qualifier arg_type_qualifier = 0;
-                error = clGetKernelArgInfo( kernel, (cl_uint)j, CL_KERNEL_ARG_TYPE_QUALIFIER, sizeof arg_type_qualifier, &arg_type_qualifier, &size );
-                test_error( error, "Unable to get argument type qualifier" );
-                error = (arg_type_qualifier != expected_type_qualifier);
-                if ( error )
+                /* We cannot have pipe pointers */
+                if (is_pipe && is_pointer)
                 {
-                    log_error( "ERROR: Bad type qualifier, kernel: \"%s\", argument number: %d, expected \"0x%X\", got \"0x%X\"\n", kernel_name, (unsigned int)j, (unsigned int)expected_type_qualifier, (unsigned int)arg_type_qualifier );
-                    arg_rc = -1;
+                    continue;
                 }
 
-                // Try to get the type of each argument.
-                memset( name, 0, max_name_len );
-                error = clGetKernelArgInfo(kernel, (cl_uint)j, CL_KERNEL_ARG_TYPE_NAME, max_name_len, name, &size );
-                test_error( error, "Unable to get argument type name" );
-                error = strcmp( (const char*) name, expected_type_name );
-                if ( error )
+
+                for (auto arg_type : type_arguments)
                 {
-                    log_error( "ERROR: Bad argument type name, kernel: \"%s\", argument number: %d, expected \"%s\", got \"%s\"\n", kernel_name, (unsigned int)j, expected_type_name, name );
-                    arg_rc = -1;
+                    /* Void Types cannot be private */
+                    if (is_private && arg_type == "void")
+                    {
+                        continue;
+                    }
+
+                    if (is_pointer)
+                    {
+                        arg_type += "*";
+                    }
+                    size_t param_size =
+                        get_param_size(arg_type, deviceID, is_pipe);
+                    if (param_size + total_param_size >= max_param_size
+                        || all_args.size() == MAX_NUMBER_OF_KERNEL_ARGS)
+                    {
+                        const std::string kernel_src =
+                            generate_kernel(all_args);
+                        failed_tests += compare_kernel_with_expected(
+                            context, deviceID, kernel_src.c_str(),
+                            expected_args);
+                        all_args.clear();
+                        expected_args.clear();
+                        total_param_size = 0;
+                    }
+                    total_param_size += param_size;
+
+                    KernelArgInfo kernel_argument(
+                        address_qualifier, access_qualifier, type_qualifier,
+                        arg_type, all_args.size());
+
+                    expected_args.push_back(
+                        create_expected_arg_info(kernel_argument, is_pointer));
+
+                    all_args.push_back(kernel_argument);
                 }
+            }
+        }
+    }
+    const std::string kernel_src = generate_kernel(all_args);
+    failed_tests += compare_kernel_with_expected(
+        context, deviceID, kernel_src.c_str(), expected_args);
+    return failed_tests;
+}
+
+static cl_uint get_max_number_of_pipes(cl_device_id deviceID, cl_int& err)
+{
+    cl_uint ret(0);
+    err = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PIPE_ARGS, sizeof(ret), &ret,
+                          nullptr);
+    return ret;
+}
 
-                // Try to get the name of each argument.
-                memset( name, 0, max_name_len );
-                error = clGetKernelArgInfo( kernel, (cl_uint)j, CL_KERNEL_ARG_NAME, max_name_len, name, &size );
-                test_error( error, "Unable to get argument name" );
-                error = strcmp( (const char*) name, expected_arg_name );
-                if ( error )
+static int run_pipe_tests(cl_context context, cl_device_id deviceID)
+{
+    int failed_tests = 0;
+
+    cl_kernel_arg_address_qualifier address_qualifier =
+        CL_KERNEL_ARG_ADDRESS_PRIVATE;
+    std::vector<std::string> type_arguments =
+        generate_all_type_arguments(deviceID);
+    const std::vector<cl_kernel_arg_access_qualifier> access_qualifiers = {
+        CL_KERNEL_ARG_ACCESS_READ_ONLY, CL_KERNEL_ARG_ACCESS_WRITE_ONLY
+    };
+    std::vector<KernelArgInfo> all_args, expected_args;
+    size_t max_param_size = get_max_param_size(deviceID);
+    size_t total_param_size(0);
+    cl_int err = CL_SUCCESS;
+    cl_uint max_number_of_pipes = get_max_number_of_pipes(deviceID, err);
+    test_error_ret(err, "get_max_number_of_pipes", TEST_FAIL);
+    cl_uint number_of_pipes(0);
+
+    const bool is_pointer = false;
+    const bool is_pipe = true;
+
+    for (auto type_qualifier : pipe_qualifiers)
+    {
+        for (auto access_qualifier : access_qualifiers)
+        {
+            for (auto arg_type : type_arguments)
+            {
+                /* We cannot have void pipes */
+                if (arg_type == "void")
                 {
-                    log_error( "ERROR: Bad argument name, kernel: \"%s\", argument number: %d, expected \"%s\", got \"%s\"\n", kernel_name, (unsigned int)j, expected_arg_name, name );
-                    arg_rc = -1;
+                    continue;
                 }
 
-                if(arg_rc != 0) {
-                    kernel_rc = -1;
+                size_t param_size = get_param_size(arg_type, deviceID, is_pipe);
+                if (param_size + total_param_size >= max_param_size
+                    || number_of_pipes == max_number_of_pipes)
+                {
+                    const std::string kernel_src = generate_kernel(all_args);
+                    failed_tests += compare_kernel_with_expected(
+                        context, deviceID, kernel_src.c_str(), expected_args);
+                    all_args.clear();
+                    expected_args.clear();
+                    total_param_size = 0;
+                    number_of_pipes = 0;
                 }
-            }
-        }
+                total_param_size += param_size;
+                number_of_pipes++;
 
-        //log_info( "%s ... %s\n",arg_info[i][0],kernel_rc == 0 ? "passed" : "failed" );
-        if(kernel_rc != 0) {
-            rc = -1;
+                KernelArgInfo kernel_argument(address_qualifier,
+                                              access_qualifier, type_qualifier,
+                                              arg_type, all_args.size());
+
+                expected_args.push_back(
+                    create_expected_arg_info(kernel_argument, is_pointer));
+
+                all_args.push_back(kernel_argument);
+            }
         }
     }
-  return rc;
+    const std::string kernel_src = generate_kernel(all_args);
+    failed_tests += compare_kernel_with_expected(
+        context, deviceID, kernel_src.c_str(), expected_args);
+    return failed_tests;
 }
 
-
-int    test_get_kernel_arg_info( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+static int run_sampler_test(cl_context context, cl_device_id deviceID)
 {
-    size_t size;
-    int error;
+    cl_kernel_arg_address_qualifier address_qualifier =
+        CL_KERNEL_ARG_ADDRESS_PRIVATE;
+    cl_kernel_arg_type_qualifier type_qualifier = CL_KERNEL_ARG_TYPE_NONE;
+    cl_kernel_arg_access_qualifier access_qualifier = CL_KERNEL_ARG_ACCESS_NONE;
+    std::string image_type = "sampler_t";
+    bool is_pointer = false;
+
+    KernelArgInfo kernel_argument(address_qualifier, access_qualifier,
+                                  type_qualifier, image_type,
+                                  SINGLE_KERNEL_ARG_NUMBER);
 
-    cl_bool supports_double = 0; // assume not
-    cl_bool supports_half = 0; // assume not
-    cl_bool supports_images = 0; // assume not
-    cl_bool supports_long = 0; // assume not
-    cl_bool supports_3D_images = 0; // assume not
+    KernelArgInfo expected =
+        create_expected_arg_info(kernel_argument, is_pointer);
 
-    // Check if this device supports images
-  error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE_SUPPORT, sizeof supports_images, &supports_images, NULL);
-  test_error(error, "clGetDeviceInfo for CL_DEVICE_IMAGE_SUPPORT failed");
+    const std::string kernel_src = generate_kernel({ kernel_argument });
 
-  if (supports_images) {
-    log_info(" o Device supports images\n");
-    log_info(" o Expecting SUCCESS when testing image kernel arguments.\n");
-  }
-  else {
-    log_info(" o Device lacks image support\n");
-    log_info(" o Not testing image kernel arguments.\n");
-  }
+    return compare_kernel_with_expected(context, deviceID, kernel_src.c_str(),
+                                        { expected });
+}
 
-    if (is_extension_available(deviceID, "cl_khr_fp64")) {
-        log_info(" o Device claims extension 'cl_khr_fp64'\n");
-        log_info(" o Expecting SUCCESS when testing double kernel arguments.\n");
-        supports_double = 1;
-    } else {
-        cl_device_fp_config double_fp_config;
-        error = clGetDeviceInfo(deviceID, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(double_fp_config), &double_fp_config, NULL);
-        test_error(error, "clGetDeviceInfo for CL_DEVICE_DOUBLE_FP_CONFIG failed");
-        if (double_fp_config != 0)
-            supports_double = 1;
-        else {
-            log_info(" o Device lacks extension 'cl_khr_fp64'\n");
-            log_info(" o Not testing double kernel arguments.\n");
-            supports_double = 0;
+static int run_image_tests(cl_context context, cl_device_id deviceID)
+{
+    int failed_tests = 0;
+    bool supports_3d_image_writes =
+        is_extension_available(deviceID, "cl_khr_3d_image_writes");
+    bool is_pointer = false;
+    cl_kernel_arg_type_qualifier type_qualifier = CL_KERNEL_ARG_TYPE_NONE;
+    cl_kernel_arg_address_qualifier address_qualifier =
+        CL_KERNEL_ARG_ADDRESS_GLOBAL;
+
+    for (auto access_qualifier : access_qualifiers)
+    {
+        bool is_write =
+            (access_qualifier == CL_KERNEL_ARG_ACCESS_WRITE_ONLY
+             || access_qualifier == CL_KERNEL_ARG_ACCESS_READ_WRITE);
+        for (auto image_type : image_arguments)
+        {
+            bool is_3d_image = image_type == "image3d_t";
+            /* We can only test 3d image writes if our device supports it */
+            if (is_3d_image && is_write)
+            {
+                if (!supports_3d_image_writes)
+                {
+                    continue;
+                }
+            }
+            KernelArgInfo kernel_argument(address_qualifier, access_qualifier,
+                                          type_qualifier, image_type,
+                                          SINGLE_KERNEL_ARG_NUMBER);
+            KernelArgInfo expected =
+                create_expected_arg_info(kernel_argument, is_pointer);
+            const std::string kernel_src =
+                generate_kernel({ kernel_argument }, supports_3d_image_writes);
+
+            failed_tests += compare_kernel_with_expected(
+                context, deviceID, kernel_src.c_str(), { expected });
         }
     }
+    failed_tests += run_sampler_test(context, deviceID);
+    return failed_tests;
+}
 
-    if (is_extension_available(deviceID, "cl_khr_fp16")) {
-        log_info(" o Device claims extension 'cl_khr_fp16'\n");
-        log_info(" o Expecting SUCCESS when testing halfn* kernel arguments.\n");
-        supports_half = 1;
-    } else {
-        log_info(" o Device lacks extension 'cl_khr_fp16'\n");
-        log_info(" o Not testing halfn* kernel arguments.\n");
-        supports_half = 0;
-    }
+/* Ensure clGetKernelArgInfo returns successfully when param_value is
+ * set to null */
+static int test_null_param(cl_context context, cl_device_id deviceID,
+                           char const* kernel_src)
+{
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    cl_int err = create_single_kernel_helper_with_build_options(
+        context, &program, &kernel, 1, &kernel_src, "get_kernel_arg_info",
+        get_build_options(deviceID).c_str());
+    test_error_ret(err, "create_single_kernel_helper_with_build_options",
+                   TEST_FAIL);
+
+    err = clGetKernelArgInfo(kernel, SINGLE_KERNEL_ARG_NUMBER,
+                             CL_KERNEL_ARG_ADDRESS_QUALIFIER, 0, nullptr,
+                             nullptr);
+    test_error_ret(err, "clGetKernelArgInfo", TEST_FAIL);
+
+    err =
+        clGetKernelArgInfo(kernel, SINGLE_KERNEL_ARG_NUMBER,
+                           CL_KERNEL_ARG_ACCESS_QUALIFIER, 0, nullptr, nullptr);
+    test_error_ret(err, "clGetKernelArgInfo", TEST_FAIL);
+
+    err = clGetKernelArgInfo(kernel, SINGLE_KERNEL_ARG_NUMBER,
+                             CL_KERNEL_ARG_TYPE_QUALIFIER, 0, nullptr, nullptr);
+    test_error_ret(err, "clGetKernelArgInfo", TEST_FAIL);
+
+    err = clGetKernelArgInfo(kernel, SINGLE_KERNEL_ARG_NUMBER,
+                             CL_KERNEL_ARG_TYPE_NAME, 0, nullptr, nullptr);
+    test_error_ret(err, "clGetKernelArgInfo", TEST_FAIL);
+
+    err = clGetKernelArgInfo(kernel, SINGLE_KERNEL_ARG_NUMBER,
+                             CL_KERNEL_ARG_NAME, 0, nullptr, nullptr);
+    test_error_ret(err, "clGetKernelArgInfo", TEST_FAIL);
+
+    return TEST_PASS;
+}
 
-    if (is_extension_available(deviceID, "cl_khr_int64"))
-    {
-        log_info(" o Device claims extension 'cl_khr_int64'\n");
-        log_info(" o Expecting SUCCESS when testing long kernel arguments.\n");
-        supports_long = 1;
-    } else
+/* Ensure clGetKernelArgInfo returns the correct size in bytes for the
+ * kernel arg name */
+static int test_arg_name_size(cl_context context, cl_device_id deviceID,
+                              char const* kernel_src)
+{
+    size_t size;
+    /* We are adding +1 because the argument used in this kernel is argument0
+     * which has 1 extra character than just the base argument name */
+    char arg_return[sizeof(KERNEL_ARGUMENT_NAME) + 1];
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    cl_int err = create_single_kernel_helper_with_build_options(
+        context, &program, &kernel, 1, &kernel_src, "get_kernel_arg_info",
+        get_build_options(deviceID).c_str());
+
+    test_error_ret(err, "create_single_kernel_helper_with_build_options",
+                   TEST_FAIL);
+
+    err =
+        clGetKernelArgInfo(kernel, SINGLE_KERNEL_ARG_NUMBER, CL_KERNEL_ARG_NAME,
+                           sizeof(arg_return), &arg_return, &size);
+    test_error_ret(err, "clGetKernelArgInfo", TEST_FAIL);
+    if (size == sizeof(KERNEL_ARGUMENT_NAME) + 1)
+    {
+        return TEST_PASS;
+    }
+    else
     {
-        log_info(" o Device lacks extension 'cl_khr_int64'\n");
-        log_info(" o Not testing long kernel arguments.\n");
-        supports_long = 0;
+        return TEST_FAIL;
     }
+}
 
-    error = checkFor3DImageSupport(deviceID);
-    if (error != CL_IMAGE_FORMAT_NOT_SUPPORTED)
-    {
-        log_info(" o Device supports 3D images\n");
-        log_info(" o Expecting SUCCESS when testing 3D image kernel arguments.\n");
-        supports_3D_images = 1;
-    } else
+static int run_boundary_tests(cl_context context, cl_device_id deviceID)
+{
+    int failed_tests = 0;
+
+    cl_kernel_arg_address_qualifier address_qualifier =
+        CL_KERNEL_ARG_ADDRESS_GLOBAL;
+    cl_kernel_arg_access_qualifier access_qualifier = CL_KERNEL_ARG_ACCESS_NONE;
+    cl_kernel_arg_type_qualifier type_qualifier = CL_KERNEL_ARG_TYPE_NONE;
+    std::string arg_type = "int*";
+    KernelArgInfo arg_info(address_qualifier, access_qualifier, type_qualifier,
+                           arg_type, SINGLE_KERNEL_ARG_NUMBER);
+    const std::string kernel_src = generate_kernel({ arg_info });
+
+    failed_tests += test_arg_name_size(context, deviceID, kernel_src.c_str());
+
+    if (test_null_param(context, deviceID, kernel_src.c_str()) != TEST_PASS)
     {
-        log_info(" o Device lacks 3D image support\n");
-        log_info(" o Not testing 3D image kernel arguments.\n");
-        supports_3D_images = 0;
+        failed_tests++;
     }
 
-    int test_failed = 0;
+    return failed_tests;
+}
 
-    // Now create a test program using required arguments
-    log_info("Testing required kernel arguments...\n");
-    error = test(deviceID, context, required_kernel_args, sizeof(required_kernel_args)/sizeof(required_kernel_args[0]), required_arg_info, sizeof(required_arg_info)/sizeof(required_arg_info[0]));
-    test_failed = (error) ? -1 : test_failed;
+static int run_all_tests(cl_context context, cl_device_id deviceID)
+{
 
-    if ( supports_images )
+    int failed_scalar_tests = run_scalar_vector_tests(context, deviceID);
+    if (failed_scalar_tests == 0)
     {
-        log_info("Testing optional image arguments...\n");
-        error = test(deviceID, context, image_kernel_args, sizeof(image_kernel_args)/sizeof(image_kernel_args[0]), image_arg_info, sizeof(image_arg_info)/sizeof(image_arg_info[0]));
-        test_failed = (error) ? -1 : test_failed;
+        log_info("All Data Type Tests Passed\n");
     }
-
-    if ( supports_double )
+    else
     {
-        log_info("Testing optional double arguments...\n");
-        error = test(deviceID, context, double_kernel_args, sizeof(double_kernel_args)/sizeof(double_kernel_args[0]), double_arg_info, sizeof(double_arg_info)/sizeof(double_arg_info[0]));
-        test_failed = (error) ? -1 : test_failed;
+        log_error("%d Data Type Test(s) Failed\n", failed_scalar_tests);
     }
 
-    if ( supports_half )
+    int failed_image_tests = 0;
+    if (checkForImageSupport(deviceID) == 0)
+    {
+        failed_image_tests = run_image_tests(context, deviceID);
+        if (failed_image_tests == 0)
+        {
+            log_info("All Image Tests Passed\n");
+        }
+        else
+        {
+            log_error("%d Image Test(s) Failed\n", failed_image_tests);
+        }
+    }
+    int failed_pipe_tests = 0;
+    // TODO https://github.com/KhronosGroup/OpenCL-CTS/issues/1244
+    if (false)
     {
-        log_info("Testing optional half arguments...\n");
-        error = test(deviceID, context, half_kernel_args, sizeof(half_kernel_args)/sizeof(half_kernel_args[0]), half_arg_info, sizeof(half_arg_info)/sizeof(half_arg_info[0]));
-        test_failed = (error) ? -1 : test_failed;
+        failed_pipe_tests = run_pipe_tests(context, deviceID);
+        if (failed_pipe_tests == 0)
+        {
+            log_info("All Pipe Tests Passed\n");
+        }
+        else
+        {
+            log_error("%d Pipe Test(s) Failed\n", failed_pipe_tests);
+        }
     }
 
-    if ( supports_long )
+    int failed_boundary_tests = run_boundary_tests(context, deviceID);
+    if (failed_boundary_tests == 0)
     {
-        log_info("Testing optional long arguments...\n");
-        error = test(deviceID, context, long_kernel_args, sizeof(long_kernel_args)/sizeof(long_kernel_args[0]), long_arg_info, sizeof(long_arg_info)/sizeof(long_arg_info[0]));
-        test_failed = (error) ? -1 : test_failed;
+        log_info("All Edge Case Tests Passed\n");
     }
-
-    if ( supports_3D_images )
+    else
     {
-        log_info("Testing optional 3D image arguments...\n");
-        error = test(deviceID, context, image_3D_kernel_args, sizeof(image_3D_kernel_args)/sizeof(image_3D_kernel_args[0]), image_3D_arg_info, sizeof(image_3D_arg_info)/sizeof(image_3D_arg_info[0]));
-        test_failed = (error) ? -1 : test_failed;
+        log_error("%d Edge Case Test(s) Failed\n", failed_boundary_tests);
     }
 
-    return test_failed;
+    return (failed_scalar_tests + failed_image_tests + failed_pipe_tests
+            + failed_boundary_tests);
+}
+
+int test_get_kernel_arg_info(cl_device_id deviceID, cl_context context,
+                             cl_command_queue queue, int num_elements)
+{
+    int failed_tests = run_all_tests(context, deviceID);
+    if (failed_tests != 0)
+    {
+        log_error("%d Test(s) Failed\n", failed_tests);
+        return TEST_FAIL;
+    }
+    else
+    {
+        return TEST_PASS;
+    }
 }
diff --git a/test_conformance/api/test_kernel_arg_info_compatibility.cpp b/test_conformance/api/test_kernel_arg_info_compatibility.cpp
deleted file mode 100644
index a6b60c265e..0000000000
--- a/test_conformance/api/test_kernel_arg_info_compatibility.cpp
+++ /dev/null
@@ -1,5159 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "testBase.h"
-#include <limits.h>
-#include <ctype.h>
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#define ARG_INFO_FIELD_COUNT        5
-
-#define ARG_INFO_ADDR_OFFSET        1
-#define ARG_INFO_ACCESS_OFFSET        2
-#define ARG_INFO_TYPE_QUAL_OFFSET    3
-#define ARG_INFO_TYPE_NAME_OFFSET    4
-#define ARG_INFO_ARG_NAME_OFFSET    5
-
-typedef char const * kernel_args_t[];
-
-static kernel_args_t required_kernel_args = {
-    "typedef float4 typedef_type;\n"
-    "\n"
-    "typedef struct struct_type {\n"
-    "    float4 float4d;\n"
-    "    int intd;\n"
-    "} typedef_struct_type;\n"
-    "\n"
-    "typedef union union_type {\n"
-    "    float4 float4d;\n"
-    "    uint4 uint4d;\n"
-    "} typedef_union_type;\n"
-    "\n"
-    "typedef enum enum_type {\n"
-    "    enum_type_zero,\n"
-    "    enum_type_one,\n"
-    "    enum_type_two\n"
-    "} typedef_enum_type;\n"
-    "\n"
-    "kernel void constant_scalar_p0(constant void*constantvoidp,\n"
-    "                              constant char *constantcharp,\n"
-    "                              constant uchar* constantucharp,\n"
-    "                              constant unsigned char * constantunsignedcharp)\n"
-  "{}\n",
-    "kernel void constant_scalar_p1(constant short*constantshortp,\n"
-    "                              constant ushort *constantushortp,\n"
-    "                              constant unsigned short* constantunsignedshortp,\n"
-    "                              constant int * constantintp)\n"
-  "{}\n",
-    "kernel void constant_scalar_p2(constant uint*constantuintp,\n"
-    "                              constant unsigned int *constantunsignedintp,\n"
-    "                              constant long* constantlongp,\n"
-    "                              constant ulong * constantulongp)\n"
-  "{}\n",
-    "kernel void constant_scalar_p3(constant unsigned long*constantunsignedlongp,\n"
-    "                              constant float *constantfloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_scalar_restrict_p0(constant void* restrict constantvoidrestrictp,\n"
-    "                                       constant char * restrict constantcharrestrictp,\n"
-    "                                       constant uchar*restrict constantucharrestrictp,\n"
-    "                                       constant unsigned char *restrict constantunsignedcharrestrictp)\n"
-    "{}\n",
-    "kernel void constant_scalar_restrict_p1(constant short* restrict constantshortrestrictp,\n"
-    "                                       constant ushort * restrict constantushortrestrictp,\n"
-    "                                       constant unsigned short*restrict constantunsignedshortrestrictp,\n"
-    "                                       constant int *restrict constantintrestrictp)\n"
-    "{}\n",
-    "kernel void constant_scalar_restrict_p2(constant uint* restrict constantuintrestrictp,\n"
-    "                                       constant unsigned int * restrict constantunsignedintrestrictp,\n"
-    "                                       constant long*restrict constantlongrestrictp,\n"
-    "                                       constant ulong *restrict constantulongrestrictp)\n"
-    "{}\n",
-    "kernel void constant_scalar_restrict_p3(constant unsigned long* restrict constantunsignedlongrestrictp,\n"
-    "                                       constant float * restrict constantfloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_scalar_p(global void*globalvoidp,\n"
-    "                            global char *globalcharp,\n"
-    "                            global uchar* globalucharp,\n"
-    "                            global unsigned char * globalunsignedcharp,\n"
-    "                            global short*globalshortp,\n"
-    "                            global ushort *globalushortp,\n"
-    "                            global unsigned short* globalunsignedshortp,\n"
-    "                            global int * globalintp,\n"
-    "                            global uint*globaluintp,\n"
-    "                            global unsigned int *globalunsignedintp,\n"
-    "                            global long* globallongp,\n"
-    "                            global ulong * globalulongp,\n"
-    "                            global unsigned long*globalunsignedlongp,\n"
-    "                            global float *globalfloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_scalar_restrict_p(global void* restrict globalvoidrestrictp,\n"
-    "                                     global char * restrict globalcharrestrictp,\n"
-    "                                     global uchar*restrict globalucharrestrictp,\n"
-    "                                     global unsigned char *restrict globalunsignedcharrestrictp,\n"
-    "                                     global short* restrict globalshortrestrictp,\n"
-    "                                     global ushort * restrict globalushortrestrictp,\n"
-    "                                     global unsigned short*restrict globalunsignedshortrestrictp,\n"
-    "                                     global int *restrict globalintrestrictp,\n"
-    "                                     global uint* restrict globaluintrestrictp,\n"
-    "                                     global unsigned int * restrict globalunsignedintrestrictp,\n"
-    "                                     global long*restrict globallongrestrictp,\n"
-    "                                     global ulong *restrict globalulongrestrictp,\n"
-    "                                     global unsigned long* restrict globalunsignedlongrestrictp,\n"
-    "                                     global float * restrict globalfloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_scalar_p(global const void*globalconstvoidp,\n"
-    "                                  global const char *globalconstcharp,\n"
-    "                                  global const uchar* globalconstucharp,\n"
-    "                                  global const unsigned char * globalconstunsignedcharp,\n"
-    "                                  global const short*globalconstshortp,\n"
-    "                                  global const ushort *globalconstushortp,\n"
-    "                                  global const unsigned short* globalconstunsignedshortp,\n"
-    "                                  global const int * globalconstintp,\n"
-    "                                  global const uint*globalconstuintp,\n"
-    "                                  global const unsigned int *globalconstunsignedintp,\n"
-    "                                  global const long* globalconstlongp,\n"
-    "                                  global const ulong * globalconstulongp,\n"
-    "                                  global const unsigned long*globalconstunsignedlongp,\n"
-    "                                  global const float *globalconstfloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_scalar_restrict_p(global const void* restrict globalconstvoidrestrictp,\n"
-    "                                           global const char * restrict globalconstcharrestrictp,\n"
-    "                                           global const uchar*restrict globalconstucharrestrictp,\n"
-    "                                           global const unsigned char *restrict globalconstunsignedcharrestrictp,\n"
-    "                                           global const short* restrict globalconstshortrestrictp,\n"
-    "                                           global const ushort * restrict globalconstushortrestrictp,\n"
-    "                                           global const unsigned short*restrict globalconstunsignedshortrestrictp,\n"
-    "                                           global const int *restrict globalconstintrestrictp,\n"
-    "                                           global const uint* restrict globalconstuintrestrictp,\n"
-    "                                           global const unsigned int * restrict globalconstunsignedintrestrictp,\n"
-    "                                           global const long*restrict globalconstlongrestrictp,\n"
-    "                                           global const ulong *restrict globalconstulongrestrictp,\n"
-    "                                           global const unsigned long* restrict globalconstunsignedlongrestrictp,\n"
-    "                                           global const float * restrict globalconstfloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_scalar_p(global volatile void*globalvolatilevoidp,\n"
-    "                                     global volatile char *globalvolatilecharp,\n"
-    "                                     global volatile uchar* globalvolatileucharp,\n"
-    "                                     global volatile unsigned char * globalvolatileunsignedcharp,\n"
-    "                                     global volatile short*globalvolatileshortp,\n"
-    "                                     global volatile ushort *globalvolatileushortp,\n"
-    "                                     global volatile unsigned short* globalvolatileunsignedshortp,\n"
-    "                                     global volatile int * globalvolatileintp,\n"
-    "                                     global volatile uint*globalvolatileuintp,\n"
-    "                                     global volatile unsigned int *globalvolatileunsignedintp,\n"
-    "                                     global volatile long* globalvolatilelongp,\n"
-    "                                     global volatile ulong * globalvolatileulongp,\n"
-    "                                     global volatile unsigned long*globalvolatileunsignedlongp,\n"
-    "                                     global volatile float *globalvolatilefloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_scalar_restrict_p(global volatile void* restrict globalvolatilevoidrestrictp,\n"
-    "                                              global volatile char * restrict globalvolatilecharrestrictp,\n"
-    "                                              global volatile uchar*restrict globalvolatileucharrestrictp,\n"
-    "                                              global volatile unsigned char *restrict globalvolatileunsignedcharrestrictp,\n"
-    "                                              global volatile short* restrict globalvolatileshortrestrictp,\n"
-    "                                              global volatile ushort * restrict globalvolatileushortrestrictp,\n"
-    "                                              global volatile unsigned short*restrict globalvolatileunsignedshortrestrictp,\n"
-    "                                              global volatile int *restrict globalvolatileintrestrictp,\n"
-    "                                              global volatile uint* restrict globalvolatileuintrestrictp,\n"
-    "                                              global volatile unsigned int * restrict globalvolatileunsignedintrestrictp,\n"
-    "                                              global volatile long*restrict globalvolatilelongrestrictp,\n"
-    "                                              global volatile ulong *restrict globalvolatileulongrestrictp,\n"
-    "                                              global volatile unsigned long* restrict globalvolatileunsignedlongrestrictp,\n"
-    "                                              global volatile float * restrict globalvolatilefloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_scalar_p(global const volatile void*globalconstvolatilevoidp,\n"
-    "                                           global const volatile char *globalconstvolatilecharp,\n"
-    "                                           global const volatile uchar* globalconstvolatileucharp,\n"
-    "                                           global const volatile unsigned char * globalconstvolatileunsignedcharp,\n"
-    "                                           global const volatile short*globalconstvolatileshortp,\n"
-    "                                           global const volatile ushort *globalconstvolatileushortp,\n"
-    "                                           global const volatile unsigned short* globalconstvolatileunsignedshortp,\n"
-    "                                           global const volatile int * globalconstvolatileintp,\n"
-    "                                           global const volatile uint*globalconstvolatileuintp,\n"
-    "                                           global const volatile unsigned int *globalconstvolatileunsignedintp,\n"
-    "                                           global const volatile long* globalconstvolatilelongp,\n"
-    "                                           global const volatile ulong * globalconstvolatileulongp,\n"
-    "                                           global const volatile unsigned long*globalconstvolatileunsignedlongp,\n"
-    "                                           global const volatile float *globalconstvolatilefloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_scalar_restrict_p(global const volatile void* restrict globalconstvolatilevoidrestrictp,\n"
-    "                                                    global const volatile char * restrict globalconstvolatilecharrestrictp,\n"
-    "                                                    global const volatile uchar*restrict globalconstvolatileucharrestrictp,\n"
-    "                                                    global const volatile unsigned char *restrict globalconstvolatileunsignedcharrestrictp,\n"
-    "                                                    global const volatile short* restrict globalconstvolatileshortrestrictp,\n"
-    "                                                    global const volatile ushort * restrict globalconstvolatileushortrestrictp,\n"
-    "                                                    global const volatile unsigned short*restrict globalconstvolatileunsignedshortrestrictp,\n"
-    "                                                    global const volatile int *restrict globalconstvolatileintrestrictp,\n"
-    "                                                    global const volatile uint* restrict globalconstvolatileuintrestrictp,\n"
-    "                                                    global const volatile unsigned int * restrict globalconstvolatileunsignedintrestrictp,\n"
-    "                                                    global const volatile long*restrict globalconstvolatilelongrestrictp,\n"
-    "                                                    global const volatile ulong *restrict globalconstvolatileulongrestrictp,\n"
-    "                                                    global const volatile unsigned long* restrict globalconstvolatileunsignedlongrestrictp,\n"
-    "                                                    global const volatile float * restrict globalconstvolatilefloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_scalar_p(local void*localvoidp,\n"
-    "                           local char *localcharp,\n"
-    "                           local uchar* localucharp,\n"
-    "                           local unsigned char * localunsignedcharp,\n"
-    "                           local short*localshortp,\n"
-    "                           local ushort *localushortp,\n"
-    "                           local unsigned short* localunsignedshortp,\n"
-    "                           local int * localintp,\n"
-    "                           local uint*localuintp,\n"
-    "                           local unsigned int *localunsignedintp,\n"
-    "                           local long* locallongp,\n"
-    "                           local ulong * localulongp,\n"
-    "                           local unsigned long*localunsignedlongp,\n"
-    "                           local float *localfloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_scalar_restrict_p(local void* restrict localvoidrestrictp,\n"
-    "                                    local char * restrict localcharrestrictp,\n"
-    "                                    local uchar*restrict localucharrestrictp,\n"
-    "                                    local unsigned char *restrict localunsignedcharrestrictp,\n"
-    "                                    local short* restrict localshortrestrictp,\n"
-    "                                    local ushort * restrict localushortrestrictp,\n"
-    "                                    local unsigned short*restrict localunsignedshortrestrictp,\n"
-    "                                    local int *restrict localintrestrictp,\n"
-    "                                    local uint* restrict localuintrestrictp,\n"
-    "                                    local unsigned int * restrict localunsignedintrestrictp,\n"
-    "                                    local long*restrict locallongrestrictp,\n"
-    "                                    local ulong *restrict localulongrestrictp,\n"
-    "                                    local unsigned long* restrict localunsignedlongrestrictp,\n"
-    "                                    local float * restrict localfloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_scalar_p(local const void*localconstvoidp,\n"
-    "                                 local const char *localconstcharp,\n"
-    "                                 local const uchar* localconstucharp,\n"
-    "                                 local const unsigned char * localconstunsignedcharp,\n"
-    "                                 local const short*localconstshortp,\n"
-    "                                 local const ushort *localconstushortp,\n"
-    "                                 local const unsigned short* localconstunsignedshortp,\n"
-    "                                 local const int * localconstintp,\n"
-    "                                 local const uint*localconstuintp,\n"
-    "                                 local const unsigned int *localconstunsignedintp,\n"
-    "                                 local const long* localconstlongp,\n"
-    "                                 local const ulong * localconstulongp,\n"
-    "                                 local const unsigned long*localconstunsignedlongp,\n"
-    "                                 local const float *localconstfloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_scalar_restrict_p(local const void* restrict localconstvoidrestrictp,\n"
-    "                                          local const char * restrict localconstcharrestrictp,\n"
-    "                                          local const uchar*restrict localconstucharrestrictp,\n"
-    "                                          local const unsigned char *restrict localconstunsignedcharrestrictp,\n"
-    "                                          local const short* restrict localconstshortrestrictp,\n"
-    "                                          local const ushort * restrict localconstushortrestrictp,\n"
-    "                                          local const unsigned short*restrict localconstunsignedshortrestrictp,\n"
-    "                                          local const int *restrict localconstintrestrictp,\n"
-    "                                          local const uint* restrict localconstuintrestrictp,\n"
-    "                                          local const unsigned int * restrict localconstunsignedintrestrictp,\n"
-    "                                          local const long*restrict localconstlongrestrictp,\n"
-    "                                          local const ulong *restrict localconstulongrestrictp,\n"
-    "                                          local const unsigned long* restrict localconstunsignedlongrestrictp,\n"
-    "                                          local const float * restrict localconstfloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_scalar_p(local volatile void*localvolatilevoidp,\n"
-    "                                    local volatile char *localvolatilecharp,\n"
-    "                                    local volatile uchar* localvolatileucharp,\n"
-    "                                    local volatile unsigned char * localvolatileunsignedcharp,\n"
-    "                                    local volatile short*localvolatileshortp,\n"
-    "                                    local volatile ushort *localvolatileushortp,\n"
-    "                                    local volatile unsigned short* localvolatileunsignedshortp,\n"
-    "                                    local volatile int * localvolatileintp,\n"
-    "                                    local volatile uint*localvolatileuintp,\n"
-    "                                    local volatile unsigned int *localvolatileunsignedintp,\n"
-    "                                    local volatile long* localvolatilelongp,\n"
-    "                                    local volatile ulong * localvolatileulongp,\n"
-    "                                    local volatile unsigned long*localvolatileunsignedlongp,\n"
-    "                                    local volatile float *localvolatilefloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_scalar_restrict_p(local volatile void* restrict localvolatilevoidrestrictp,\n"
-    "                                             local volatile char * restrict localvolatilecharrestrictp,\n"
-    "                                             local volatile uchar*restrict localvolatileucharrestrictp,\n"
-    "                                             local volatile unsigned char *restrict localvolatileunsignedcharrestrictp,\n"
-    "                                             local volatile short* restrict localvolatileshortrestrictp,\n"
-    "                                             local volatile ushort * restrict localvolatileushortrestrictp,\n"
-    "                                             local volatile unsigned short*restrict localvolatileunsignedshortrestrictp,\n"
-    "                                             local volatile int *restrict localvolatileintrestrictp,\n"
-    "                                             local volatile uint* restrict localvolatileuintrestrictp,\n"
-    "                                             local volatile unsigned int * restrict localvolatileunsignedintrestrictp,\n"
-    "                                             local volatile long*restrict localvolatilelongrestrictp,\n"
-    "                                             local volatile ulong *restrict localvolatileulongrestrictp,\n"
-    "                                             local volatile unsigned long* restrict localvolatileunsignedlongrestrictp,\n"
-    "                                             local volatile float * restrict localvolatilefloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_scalar_p(local const volatile void*localconstvolatilevoidp,\n"
-    "                                          local const volatile char *localconstvolatilecharp,\n"
-    "                                          local const volatile uchar* localconstvolatileucharp,\n"
-    "                                          local const volatile unsigned char * localconstvolatileunsignedcharp,\n"
-    "                                          local const volatile short*localconstvolatileshortp,\n"
-    "                                          local const volatile ushort *localconstvolatileushortp,\n"
-    "                                          local const volatile unsigned short* localconstvolatileunsignedshortp,\n"
-    "                                          local const volatile int * localconstvolatileintp,\n"
-    "                                          local const volatile uint*localconstvolatileuintp,\n"
-    "                                          local const volatile unsigned int *localconstvolatileunsignedintp,\n"
-    "                                          local const volatile long* localconstvolatilelongp,\n"
-    "                                          local const volatile ulong * localconstvolatileulongp,\n"
-    "                                          local const volatile unsigned long*localconstvolatileunsignedlongp,\n"
-    "                                          local const volatile float *localconstvolatilefloatp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_scalar_restrict_p(local const volatile void* restrict localconstvolatilevoidrestrictp,\n"
-    "                                                   local const volatile char * restrict localconstvolatilecharrestrictp,\n"
-    "                                                   local const volatile uchar*restrict localconstvolatileucharrestrictp,\n"
-    "                                                   local const volatile unsigned char *restrict localconstvolatileunsignedcharrestrictp,\n"
-    "                                                   local const volatile short* restrict localconstvolatileshortrestrictp,\n"
-    "                                                   local const volatile ushort * restrict localconstvolatileushortrestrictp,\n"
-    "                                                   local const volatile unsigned short*restrict localconstvolatileunsignedshortrestrictp,\n"
-    "                                                   local const volatile int *restrict localconstvolatileintrestrictp,\n"
-    "                                                   local const volatile uint* restrict localconstvolatileuintrestrictp,\n"
-    "                                                   local const volatile unsigned int * restrict localconstvolatileunsignedintrestrictp,\n"
-    "                                                   local const volatile long*restrict localconstvolatilelongrestrictp,\n"
-    "                                                   local const volatile ulong *restrict localconstvolatileulongrestrictp,\n"
-    "                                                   local const volatile unsigned long* restrict localconstvolatileunsignedlongrestrictp,\n"
-    "                                                   local const volatile float * restrict localconstvolatilefloatrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void scalar_d(char chard,\n"
-    "                     uchar uchard,\n"
-    "                     unsigned char unsignedchard,\n"
-    "                     short shortd,\n"
-    "                     ushort ushortd,\n"
-    "                     unsigned short unsignedshortd,\n"
-    "                     int intd,\n"
-    "                     uint uintd,\n"
-    "                     unsigned int unsignedintd,\n"
-    "                     long longd,\n"
-    "                     ulong ulongd,\n"
-    "                     unsigned long unsignedlongd,\n"
-    "                     float floatd)\n"
-    "{}\n",
-    "\n"
-    "kernel void const_scalar_d(const char constchard,\n"
-    "                           const uchar constuchard,\n"
-    "                           const unsigned char constunsignedchard,\n"
-    "                           const short constshortd,\n"
-    "                           const ushort constushortd,\n"
-    "                           const unsigned short constunsignedshortd,\n"
-    "                           const int constintd,\n"
-    "                           const uint constuintd,\n"
-    "                           const unsigned int constunsignedintd,\n"
-    "                           const long constlongd,\n"
-    "                           const ulong constulongd,\n"
-    "                           const unsigned long constunsignedlongd,\n"
-    "                           const float constfloatd)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_scalar_d(private char privatechard,\n"
-    "                             private uchar privateuchard,\n"
-    "                             private unsigned char privateunsignedchard,\n"
-    "                             private short privateshortd,\n"
-    "                             private ushort privateushortd,\n"
-    "                             private unsigned short privateunsignedshortd,\n"
-    "                             private int privateintd,\n"
-    "                             private uint privateuintd,\n"
-    "                             private unsigned int privateunsignedintd,\n"
-    "                             private long privatelongd,\n"
-    "                             private ulong privateulongd,\n"
-    "                             private unsigned long privateunsignedlongd,\n"
-    "                             private float privatefloatd)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_const_scalar_d(private const char privateconstchard,\n"
-    "                                   private const uchar privateconstuchard,\n"
-    "                                   private const unsigned char privateconstunsignedchard,\n"
-    "                                   private const short privateconstshortd,\n"
-    "                                   private const ushort privateconstushortd,\n"
-    "                                   private const unsigned short privateconstunsignedshortd,\n"
-    "                                   private const int privateconstintd,\n"
-    "                                   private const uint privateconstuintd,\n"
-    "                                   private const unsigned int privateconstunsignedintd,\n"
-    "                                   private const long privateconstlongd,\n"
-    "                                   private const ulong privateconstulongd,\n"
-    "                                   private const unsigned long privateconstunsignedlongd,\n"
-    "                                   private const float privateconstfloatd)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector2_p0(constant char2*constantchar2p,\n"
-    "                               constant uchar2 *constantuchar2p,\n"
-    "                               constant short2* constantshort2p,\n"
-    "                               constant ushort2 * constantushort2p)\n"
-  "{}\n",
-    "\n"
-    "kernel void constant_vector2_p1(constant int2*constantint2p,\n"
-    "                               constant uint2 *constantuint2p,\n"
-    "                               constant long2* constantlong2p,\n"
-    "                               constant ulong2 * constantulong2p)\n"
-  "{}\n",
-    "\n"
-    "kernel void constant_vector2_p2(constant float2*constantfloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector2_restrict_p0(constant char2 *restrict constantchar2restrictp,\n"
-    "                                        constant uchar2* restrict constantuchar2restrictp,\n"
-    "                                        constant short2 * restrict constantshort2restrictp,\n"
-    "                                        constant ushort2*restrict constantushort2restrictp)\n"
-  "{}\n",
-    "\n"
-    "kernel void constant_vector2_restrict_p1(constant int2 *restrict constantint2restrictp,\n"
-    "                                        constant uint2* restrict constantuint2restrictp,\n"
-    "                                        constant long2 * restrict constantlong2restrictp,\n"
-    "                                        constant ulong2*restrict constantulong2restrictp)\n"
-  "{}\n",
-    "\n"
-    "kernel void constant_vector2_restrict_p2(constant float2 *restrict constantfloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector2_p(global char2*globalchar2p,\n"
-    "                             global uchar2 *globaluchar2p,\n"
-    "                             global short2* globalshort2p,\n"
-    "                             global ushort2 * globalushort2p,\n"
-    "                             global int2*globalint2p,\n"
-    "                             global uint2 *globaluint2p,\n"
-    "                             global long2* globallong2p,\n"
-    "                             global ulong2 * globalulong2p,\n"
-    "                             global float2*globalfloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector2_restrict_p(global char2 *restrict globalchar2restrictp,\n"
-    "                                      global uchar2* restrict globaluchar2restrictp,\n"
-    "                                      global short2 * restrict globalshort2restrictp,\n"
-    "                                      global ushort2*restrict globalushort2restrictp,\n"
-    "                                      global int2 *restrict globalint2restrictp,\n"
-    "                                      global uint2* restrict globaluint2restrictp,\n"
-    "                                      global long2 * restrict globallong2restrictp,\n"
-    "                                      global ulong2*restrict globalulong2restrictp,\n"
-    "                                      global float2 *restrict globalfloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector2_p(global const char2* globalconstchar2p,\n"
-    "                                   global const uchar2 * globalconstuchar2p,\n"
-    "                                   global const short2*globalconstshort2p,\n"
-    "                                   global const ushort2 *globalconstushort2p,\n"
-    "                                   global const int2* globalconstint2p,\n"
-    "                                   global const uint2 * globalconstuint2p,\n"
-    "                                   global const long2*globalconstlong2p,\n"
-    "                                   global const ulong2 *globalconstulong2p,\n"
-    "                                   global const float2* globalconstfloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector2_restrict_p(global const char2 * restrict globalconstchar2restrictp,\n"
-    "                                            global const uchar2*restrict globalconstuchar2restrictp,\n"
-    "                                            global const short2 *restrict globalconstshort2restrictp,\n"
-    "                                            global const ushort2* restrict globalconstushort2restrictp,\n"
-    "                                            global const int2 * restrict globalconstint2restrictp,\n"
-    "                                            global const uint2*restrict globalconstuint2restrictp,\n"
-    "                                            global const long2 *restrict globalconstlong2restrictp,\n"
-    "                                            global const ulong2* restrict globalconstulong2restrictp,\n"
-    "                                            global const float2 * restrict globalconstfloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector2_p(global volatile char2*globalvolatilechar2p,\n"
-    "                                      global volatile uchar2 *globalvolatileuchar2p,\n"
-    "                                      global volatile short2* globalvolatileshort2p,\n"
-    "                                      global volatile ushort2 * globalvolatileushort2p,\n"
-    "                                      global volatile int2*globalvolatileint2p,\n"
-    "                                      global volatile uint2 *globalvolatileuint2p,\n"
-    "                                      global volatile long2* globalvolatilelong2p,\n"
-    "                                      global volatile ulong2 * globalvolatileulong2p,\n"
-    "                                      global volatile float2*globalvolatilefloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector2_restrict_p(global volatile char2 *restrict globalvolatilechar2restrictp,\n"
-    "                                               global volatile uchar2* restrict globalvolatileuchar2restrictp,\n"
-    "                                               global volatile short2 * restrict globalvolatileshort2restrictp,\n"
-    "                                               global volatile ushort2*restrict globalvolatileushort2restrictp,\n"
-    "                                               global volatile int2 *restrict globalvolatileint2restrictp,\n"
-    "                                               global volatile uint2* restrict globalvolatileuint2restrictp,\n"
-    "                                               global volatile long2 * restrict globalvolatilelong2restrictp,\n"
-    "                                               global volatile ulong2*restrict globalvolatileulong2restrictp,\n"
-    "                                               global volatile float2 *restrict globalvolatilefloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector2_p(global const volatile char2* globalconstvolatilechar2p,\n"
-    "                                            global const volatile uchar2 * globalconstvolatileuchar2p,\n"
-    "                                            global const volatile short2*globalconstvolatileshort2p,\n"
-    "                                            global const volatile ushort2 *globalconstvolatileushort2p,\n"
-    "                                            global const volatile int2* globalconstvolatileint2p,\n"
-    "                                            global const volatile uint2 * globalconstvolatileuint2p,\n"
-    "                                            global const volatile long2*globalconstvolatilelong2p,\n"
-    "                                            global const volatile ulong2 *globalconstvolatileulong2p,\n"
-    "                                            global const volatile float2* globalconstvolatilefloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector2_restrict_p(global const volatile char2 * restrict globalconstvolatilechar2restrictp,\n"
-    "                                                     global const volatile uchar2*restrict globalconstvolatileuchar2restrictp,\n"
-    "                                                     global const volatile short2 *restrict globalconstvolatileshort2restrictp,\n"
-    "                                                     global const volatile ushort2* restrict globalconstvolatileushort2restrictp,\n"
-    "                                                     global const volatile int2 * restrict globalconstvolatileint2restrictp,\n"
-    "                                                     global const volatile uint2*restrict globalconstvolatileuint2restrictp,\n"
-    "                                                     global const volatile long2 *restrict globalconstvolatilelong2restrictp,\n"
-    "                                                     global const volatile ulong2* restrict globalconstvolatileulong2restrictp,\n"
-    "                                                     global const volatile float2 * restrict globalconstvolatilefloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector2_p(local char2*localchar2p,\n"
-    "                            local uchar2 *localuchar2p,\n"
-    "                            local short2* localshort2p,\n"
-    "                            local ushort2 * localushort2p,\n"
-    "                            local int2*localint2p,\n"
-    "                            local uint2 *localuint2p,\n"
-    "                            local long2* locallong2p,\n"
-    "                            local ulong2 * localulong2p,\n"
-    "                            local float2*localfloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector2_restrict_p(local char2 *restrict localchar2restrictp,\n"
-    "                                     local uchar2* restrict localuchar2restrictp,\n"
-    "                                     local short2 * restrict localshort2restrictp,\n"
-    "                                     local ushort2*restrict localushort2restrictp,\n"
-    "                                     local int2 *restrict localint2restrictp,\n"
-    "                                     local uint2* restrict localuint2restrictp,\n"
-    "                                     local long2 * restrict locallong2restrictp,\n"
-    "                                     local ulong2*restrict localulong2restrictp,\n"
-    "                                     local float2 *restrict localfloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector2_p(local const char2* localconstchar2p,\n"
-    "                                  local const uchar2 * localconstuchar2p,\n"
-    "                                  local const short2*localconstshort2p,\n"
-    "                                  local const ushort2 *localconstushort2p,\n"
-    "                                  local const int2* localconstint2p,\n"
-    "                                  local const uint2 * localconstuint2p,\n"
-    "                                  local const long2*localconstlong2p,\n"
-    "                                  local const ulong2 *localconstulong2p,\n"
-    "                                  local const float2* localconstfloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector2_restrict_p(local const char2 * restrict localconstchar2restrictp,\n"
-    "                                           local const uchar2*restrict localconstuchar2restrictp,\n"
-    "                                           local const short2 *restrict localconstshort2restrictp,\n"
-    "                                           local const ushort2* restrict localconstushort2restrictp,\n"
-    "                                           local const int2 * restrict localconstint2restrictp,\n"
-    "                                           local const uint2*restrict localconstuint2restrictp,\n"
-    "                                           local const long2 *restrict localconstlong2restrictp,\n"
-    "                                           local const ulong2* restrict localconstulong2restrictp,\n"
-    "                                           local const float2 * restrict localconstfloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector2_p(local volatile char2*localvolatilechar2p,\n"
-    "                                     local volatile uchar2 *localvolatileuchar2p,\n"
-    "                                     local volatile short2* localvolatileshort2p,\n"
-    "                                     local volatile ushort2 * localvolatileushort2p,\n"
-    "                                     local volatile int2*localvolatileint2p,\n"
-    "                                     local volatile uint2 *localvolatileuint2p,\n"
-    "                                     local volatile long2* localvolatilelong2p,\n"
-    "                                     local volatile ulong2 * localvolatileulong2p,\n"
-    "                                     local volatile float2*localvolatilefloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector2_restrict_p(local volatile char2 *restrict localvolatilechar2restrictp,\n"
-    "                                              local volatile uchar2* restrict localvolatileuchar2restrictp,\n"
-    "                                              local volatile short2 * restrict localvolatileshort2restrictp,\n"
-    "                                              local volatile ushort2*restrict localvolatileushort2restrictp,\n"
-    "                                              local volatile int2 *restrict localvolatileint2restrictp,\n"
-    "                                              local volatile uint2* restrict localvolatileuint2restrictp,\n"
-    "                                              local volatile long2 * restrict localvolatilelong2restrictp,\n"
-    "                                              local volatile ulong2*restrict localvolatileulong2restrictp,\n"
-    "                                              local volatile float2 *restrict localvolatilefloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector2_p(local const volatile char2* localconstvolatilechar2p,\n"
-    "                                           local const volatile uchar2 * localconstvolatileuchar2p,\n"
-    "                                           local const volatile short2*localconstvolatileshort2p,\n"
-    "                                           local const volatile ushort2 *localconstvolatileushort2p,\n"
-    "                                           local const volatile int2* localconstvolatileint2p,\n"
-    "                                           local const volatile uint2 * localconstvolatileuint2p,\n"
-    "                                           local const volatile long2*localconstvolatilelong2p,\n"
-    "                                           local const volatile ulong2 *localconstvolatileulong2p,\n"
-    "                                           local const volatile float2* localconstvolatilefloat2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector2_restrict_p(local const volatile char2 * restrict localconstvolatilechar2restrictp,\n"
-    "                                                    local const volatile uchar2*restrict localconstvolatileuchar2restrictp,\n"
-    "                                                    local const volatile short2 *restrict localconstvolatileshort2restrictp,\n"
-    "                                                    local const volatile ushort2* restrict localconstvolatileushort2restrictp,\n"
-    "                                                    local const volatile int2 * restrict localconstvolatileint2restrictp,\n"
-    "                                                    local const volatile uint2*restrict localconstvolatileuint2restrictp,\n"
-    "                                                    local const volatile long2 *restrict localconstvolatilelong2restrictp,\n"
-    "                                                    local const volatile ulong2* restrict localconstvolatileulong2restrictp,\n"
-    "                                                    local const volatile float2 * restrict localconstvolatilefloat2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void vector2_d(char2 char2d,\n"
-    "                      uchar2 uchar2d,\n"
-    "                      short2 short2d,\n"
-    "                      ushort2 ushort2d,\n"
-    "                      int2 int2d,\n"
-    "                      uint2 uint2d,\n"
-    "                      long2 long2d,\n"
-    "                      ulong2 ulong2d,\n"
-    "                      float2 float2d)\n"
-    "{}\n",
-    "\n"
-    "kernel void const_vector2_d(const char2 constchar2d,\n"
-    "                            const uchar2 constuchar2d,\n"
-    "                            const short2 constshort2d,\n"
-    "                            const ushort2 constushort2d,\n"
-    "                            const int2 constint2d,\n"
-    "                            const uint2 constuint2d,\n"
-    "                            const long2 constlong2d,\n"
-    "                            const ulong2 constulong2d,\n"
-    "                            const float2 constfloat2d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_vector2_d(private char2 privatechar2d,\n"
-    "                              private uchar2 privateuchar2d,\n"
-    "                              private short2 privateshort2d,\n"
-    "                              private ushort2 privateushort2d,\n"
-    "                              private int2 privateint2d,\n"
-    "                              private uint2 privateuint2d,\n"
-    "                              private long2 privatelong2d,\n"
-    "                              private ulong2 privateulong2d,\n"
-    "                              private float2 privatefloat2d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_const_vector2_d(private const char2 privateconstchar2d,\n"
-    "                                    private const uchar2 privateconstuchar2d,\n"
-    "                                    private const short2 privateconstshort2d,\n"
-    "                                    private const ushort2 privateconstushort2d,\n"
-    "                                    private const int2 privateconstint2d,\n"
-    "                                    private const uint2 privateconstuint2d,\n"
-    "                                    private const long2 privateconstlong2d,\n"
-    "                                    private const ulong2 privateconstulong2d,\n"
-    "                                    private const float2 privateconstfloat2d)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector3_p0(constant char3*constantchar3p,\n"
-    "                               constant uchar3 *constantuchar3p,\n"
-    "                               constant short3* constantshort3p,\n"
-    "                               constant ushort3 * constantushort3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector3_p1(constant int3*constantint3p,\n"
-    "                               constant uint3 *constantuint3p,\n"
-    "                               constant long3* constantlong3p,\n"
-    "                               constant ulong3 * constantulong3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector3_p2(constant float3*constantfloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector3_restrict_p0(constant char3 *restrict constantchar3restrictp,\n"
-    "                                        constant uchar3* restrict constantuchar3restrictp,\n"
-    "                                        constant short3 * restrict constantshort3restrictp,\n"
-    "                                        constant ushort3*restrict constantushort3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector3_restrict_p1(constant int3 *restrict constantint3restrictp,\n"
-    "                                        constant uint3* restrict constantuint3restrictp,\n"
-    "                                        constant long3 * restrict constantlong3restrictp,\n"
-    "                                        constant ulong3*restrict constantulong3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector3_restrict_p2(constant float3 *restrict constantfloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector3_p(global char3*globalchar3p,\n"
-    "                             global uchar3 *globaluchar3p,\n"
-    "                             global short3* globalshort3p,\n"
-    "                             global ushort3 * globalushort3p,\n"
-    "                             global int3*globalint3p,\n"
-    "                             global uint3 *globaluint3p,\n"
-    "                             global long3* globallong3p,\n"
-    "                             global ulong3 * globalulong3p,\n"
-    "                             global float3*globalfloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector3_restrict_p(global char3 *restrict globalchar3restrictp,\n"
-    "                                      global uchar3* restrict globaluchar3restrictp,\n"
-    "                                      global short3 * restrict globalshort3restrictp,\n"
-    "                                      global ushort3*restrict globalushort3restrictp,\n"
-    "                                      global int3 *restrict globalint3restrictp,\n"
-    "                                      global uint3* restrict globaluint3restrictp,\n"
-    "                                      global long3 * restrict globallong3restrictp,\n"
-    "                                      global ulong3*restrict globalulong3restrictp,\n"
-    "                                      global float3 *restrict globalfloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector3_p(global const char3* globalconstchar3p,\n"
-    "                                   global const uchar3 * globalconstuchar3p,\n"
-    "                                   global const short3*globalconstshort3p,\n"
-    "                                   global const ushort3 *globalconstushort3p,\n"
-    "                                   global const int3* globalconstint3p,\n"
-    "                                   global const uint3 * globalconstuint3p,\n"
-    "                                   global const long3*globalconstlong3p,\n"
-    "                                   global const ulong3 *globalconstulong3p,\n"
-    "                                   global const float3* globalconstfloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector3_restrict_p(global const char3 * restrict globalconstchar3restrictp,\n"
-    "                                            global const uchar3*restrict globalconstuchar3restrictp,\n"
-    "                                            global const short3 *restrict globalconstshort3restrictp,\n"
-    "                                            global const ushort3* restrict globalconstushort3restrictp,\n"
-    "                                            global const int3 * restrict globalconstint3restrictp,\n"
-    "                                            global const uint3*restrict globalconstuint3restrictp,\n"
-    "                                            global const long3 *restrict globalconstlong3restrictp,\n"
-    "                                            global const ulong3* restrict globalconstulong3restrictp,\n"
-    "                                            global const float3 * restrict globalconstfloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector3_p(global volatile char3*globalvolatilechar3p,\n"
-    "                                      global volatile uchar3 *globalvolatileuchar3p,\n"
-    "                                      global volatile short3* globalvolatileshort3p,\n"
-    "                                      global volatile ushort3 * globalvolatileushort3p,\n"
-    "                                      global volatile int3*globalvolatileint3p,\n"
-    "                                      global volatile uint3 *globalvolatileuint3p,\n"
-    "                                      global volatile long3* globalvolatilelong3p,\n"
-    "                                      global volatile ulong3 * globalvolatileulong3p,\n"
-    "                                      global volatile float3*globalvolatilefloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector3_restrict_p(global volatile char3 *restrict globalvolatilechar3restrictp,\n"
-    "                                               global volatile uchar3* restrict globalvolatileuchar3restrictp,\n"
-    "                                               global volatile short3 * restrict globalvolatileshort3restrictp,\n"
-    "                                               global volatile ushort3*restrict globalvolatileushort3restrictp,\n"
-    "                                               global volatile int3 *restrict globalvolatileint3restrictp,\n"
-    "                                               global volatile uint3* restrict globalvolatileuint3restrictp,\n"
-    "                                               global volatile long3 * restrict globalvolatilelong3restrictp,\n"
-    "                                               global volatile ulong3*restrict globalvolatileulong3restrictp,\n"
-    "                                               global volatile float3 *restrict globalvolatilefloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector3_p(global const volatile char3* globalconstvolatilechar3p,\n"
-    "                                            global const volatile uchar3 * globalconstvolatileuchar3p,\n"
-    "                                            global const volatile short3*globalconstvolatileshort3p,\n"
-    "                                            global const volatile ushort3 *globalconstvolatileushort3p,\n"
-    "                                            global const volatile int3* globalconstvolatileint3p,\n"
-    "                                            global const volatile uint3 * globalconstvolatileuint3p,\n"
-    "                                            global const volatile long3*globalconstvolatilelong3p,\n"
-    "                                            global const volatile ulong3 *globalconstvolatileulong3p,\n"
-    "                                            global const volatile float3* globalconstvolatilefloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector3_restrict_p(global const volatile char3 * restrict globalconstvolatilechar3restrictp,\n"
-    "                                                     global const volatile uchar3*restrict globalconstvolatileuchar3restrictp,\n"
-    "                                                     global const volatile short3 *restrict globalconstvolatileshort3restrictp,\n"
-    "                                                     global const volatile ushort3* restrict globalconstvolatileushort3restrictp,\n"
-    "                                                     global const volatile int3 * restrict globalconstvolatileint3restrictp,\n"
-    "                                                     global const volatile uint3*restrict globalconstvolatileuint3restrictp,\n"
-    "                                                     global const volatile long3 *restrict globalconstvolatilelong3restrictp,\n"
-    "                                                     global const volatile ulong3* restrict globalconstvolatileulong3restrictp,\n"
-    "                                                     global const volatile float3 * restrict globalconstvolatilefloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector3_p(local char3*localchar3p,\n"
-    "                            local uchar3 *localuchar3p,\n"
-    "                            local short3* localshort3p,\n"
-    "                            local ushort3 * localushort3p,\n"
-    "                            local int3*localint3p,\n"
-    "                            local uint3 *localuint3p,\n"
-    "                            local long3* locallong3p,\n"
-    "                            local ulong3 * localulong3p,\n"
-    "                            local float3*localfloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector3_restrict_p(local char3 *restrict localchar3restrictp,\n"
-    "                                     local uchar3* restrict localuchar3restrictp,\n"
-    "                                     local short3 * restrict localshort3restrictp,\n"
-    "                                     local ushort3*restrict localushort3restrictp,\n"
-    "                                     local int3 *restrict localint3restrictp,\n"
-    "                                     local uint3* restrict localuint3restrictp,\n"
-    "                                     local long3 * restrict locallong3restrictp,\n"
-    "                                     local ulong3*restrict localulong3restrictp,\n"
-    "                                     local float3 *restrict localfloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector3_p(local const char3* localconstchar3p,\n"
-    "                                  local const uchar3 * localconstuchar3p,\n"
-    "                                  local const short3*localconstshort3p,\n"
-    "                                  local const ushort3 *localconstushort3p,\n"
-    "                                  local const int3* localconstint3p,\n"
-    "                                  local const uint3 * localconstuint3p,\n"
-    "                                  local const long3*localconstlong3p,\n"
-    "                                  local const ulong3 *localconstulong3p,\n"
-    "                                  local const float3* localconstfloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector3_restrict_p(local const char3 * restrict localconstchar3restrictp,\n"
-    "                                           local const uchar3*restrict localconstuchar3restrictp,\n"
-    "                                           local const short3 *restrict localconstshort3restrictp,\n"
-    "                                           local const ushort3* restrict localconstushort3restrictp,\n"
-    "                                           local const int3 * restrict localconstint3restrictp,\n"
-    "                                           local const uint3*restrict localconstuint3restrictp,\n"
-    "                                           local const long3 *restrict localconstlong3restrictp,\n"
-    "                                           local const ulong3* restrict localconstulong3restrictp,\n"
-    "                                           local const float3 * restrict localconstfloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector3_p(local volatile char3*localvolatilechar3p,\n"
-    "                                     local volatile uchar3 *localvolatileuchar3p,\n"
-    "                                     local volatile short3* localvolatileshort3p,\n"
-    "                                     local volatile ushort3 * localvolatileushort3p,\n"
-    "                                     local volatile int3*localvolatileint3p,\n"
-    "                                     local volatile uint3 *localvolatileuint3p,\n"
-    "                                     local volatile long3* localvolatilelong3p,\n"
-    "                                     local volatile ulong3 * localvolatileulong3p,\n"
-    "                                     local volatile float3*localvolatilefloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector3_restrict_p(local volatile char3 *restrict localvolatilechar3restrictp,\n"
-    "                                              local volatile uchar3* restrict localvolatileuchar3restrictp,\n"
-    "                                              local volatile short3 * restrict localvolatileshort3restrictp,\n"
-    "                                              local volatile ushort3*restrict localvolatileushort3restrictp,\n"
-    "                                              local volatile int3 *restrict localvolatileint3restrictp,\n"
-    "                                              local volatile uint3* restrict localvolatileuint3restrictp,\n"
-    "                                              local volatile long3 * restrict localvolatilelong3restrictp,\n"
-    "                                              local volatile ulong3*restrict localvolatileulong3restrictp,\n"
-    "                                              local volatile float3 *restrict localvolatilefloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector3_p(local const volatile char3* localconstvolatilechar3p,\n"
-    "                                           local const volatile uchar3 * localconstvolatileuchar3p,\n"
-    "                                           local const volatile short3*localconstvolatileshort3p,\n"
-    "                                           local const volatile ushort3 *localconstvolatileushort3p,\n"
-    "                                           local const volatile int3* localconstvolatileint3p,\n"
-    "                                           local const volatile uint3 * localconstvolatileuint3p,\n"
-    "                                           local const volatile long3*localconstvolatilelong3p,\n"
-    "                                           local const volatile ulong3 *localconstvolatileulong3p,\n"
-    "                                           local const volatile float3* localconstvolatilefloat3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector3_restrict_p(local const volatile char3 * restrict localconstvolatilechar3restrictp,\n"
-    "                                                    local const volatile uchar3*restrict localconstvolatileuchar3restrictp,\n"
-    "                                                    local const volatile short3 *restrict localconstvolatileshort3restrictp,\n"
-    "                                                    local const volatile ushort3* restrict localconstvolatileushort3restrictp,\n"
-    "                                                    local const volatile int3 * restrict localconstvolatileint3restrictp,\n"
-    "                                                    local const volatile uint3*restrict localconstvolatileuint3restrictp,\n"
-    "                                                    local const volatile long3 *restrict localconstvolatilelong3restrictp,\n"
-    "                                                    local const volatile ulong3* restrict localconstvolatileulong3restrictp,\n"
-    "                                                    local const volatile float3 * restrict localconstvolatilefloat3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void vector3_d(char3 char3d,\n"
-    "                      uchar3 uchar3d,\n"
-    "                      short3 short3d,\n"
-    "                      ushort3 ushort3d,\n"
-    "                      int3 int3d,\n"
-    "                      uint3 uint3d,\n"
-    "                      long3 long3d,\n"
-    "                      ulong3 ulong3d,\n"
-    "                      float3 float3d)\n"
-    "{}\n",
-    "\n"
-    "kernel void const_vector3_d(const char3 constchar3d,\n"
-    "                            const uchar3 constuchar3d,\n"
-    "                            const short3 constshort3d,\n"
-    "                            const ushort3 constushort3d,\n"
-    "                            const int3 constint3d,\n"
-    "                            const uint3 constuint3d,\n"
-    "                            const long3 constlong3d,\n"
-    "                            const ulong3 constulong3d,\n"
-    "                            const float3 constfloat3d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_vector3_d(private char3 privatechar3d,\n"
-    "                              private uchar3 privateuchar3d,\n"
-    "                              private short3 privateshort3d,\n"
-    "                              private ushort3 privateushort3d,\n"
-    "                              private int3 privateint3d,\n"
-    "                              private uint3 privateuint3d,\n"
-    "                              private long3 privatelong3d,\n"
-    "                              private ulong3 privateulong3d,\n"
-    "                              private float3 privatefloat3d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_const_vector3_d(private const char3 privateconstchar3d,\n"
-    "                                    private const uchar3 privateconstuchar3d,\n"
-    "                                    private const short3 privateconstshort3d,\n"
-    "                                    private const ushort3 privateconstushort3d,\n"
-    "                                    private const int3 privateconstint3d,\n"
-    "                                    private const uint3 privateconstuint3d,\n"
-    "                                    private const long3 privateconstlong3d,\n"
-    "                                    private const ulong3 privateconstulong3d,\n"
-    "                                    private const float3 privateconstfloat3d)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector4_p0(constant char4*constantchar4p,\n"
-    "                               constant uchar4 *constantuchar4p,\n"
-    "                               constant short4* constantshort4p,\n"
-    "                               constant ushort4 * constantushort4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector4_p1(constant int4*constantint4p,\n"
-    "                               constant uint4 *constantuint4p,\n"
-    "                               constant long4* constantlong4p,\n"
-    "                               constant ulong4 * constantulong4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector4_p2(constant float4*constantfloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector4_restrict_p0(constant char4 *restrict constantchar4restrictp,\n"
-    "                                        constant uchar4* restrict constantuchar4restrictp,\n"
-    "                                        constant short4 * restrict constantshort4restrictp,\n"
-    "                                        constant ushort4*restrict constantushort4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector4_restrict_p1(constant int4 *restrict constantint4restrictp,\n"
-    "                                        constant uint4* restrict constantuint4restrictp,\n"
-    "                                        constant long4 * restrict constantlong4restrictp,\n"
-    "                                        constant ulong4*restrict constantulong4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector4_restrict_p2(constant float4 *restrict constantfloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector4_p(global char4*globalchar4p,\n"
-    "                             global uchar4 *globaluchar4p,\n"
-    "                             global short4* globalshort4p,\n"
-    "                             global ushort4 * globalushort4p,\n"
-    "                             global int4*globalint4p,\n"
-    "                             global uint4 *globaluint4p,\n"
-    "                             global long4* globallong4p,\n"
-    "                             global ulong4 * globalulong4p,\n"
-    "                             global float4*globalfloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector4_restrict_p(global char4 *restrict globalchar4restrictp,\n"
-    "                                      global uchar4* restrict globaluchar4restrictp,\n"
-    "                                      global short4 * restrict globalshort4restrictp,\n"
-    "                                      global ushort4*restrict globalushort4restrictp,\n"
-    "                                      global int4 *restrict globalint4restrictp,\n"
-    "                                      global uint4* restrict globaluint4restrictp,\n"
-    "                                      global long4 * restrict globallong4restrictp,\n"
-    "                                      global ulong4*restrict globalulong4restrictp,\n"
-    "                                      global float4 *restrict globalfloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector4_p(global const char4* globalconstchar4p,\n"
-    "                                   global const uchar4 * globalconstuchar4p,\n"
-    "                                   global const short4*globalconstshort4p,\n"
-    "                                   global const ushort4 *globalconstushort4p,\n"
-    "                                   global const int4* globalconstint4p,\n"
-    "                                   global const uint4 * globalconstuint4p,\n"
-    "                                   global const long4*globalconstlong4p,\n"
-    "                                   global const ulong4 *globalconstulong4p,\n"
-    "                                   global const float4* globalconstfloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector4_restrict_p(global const char4 * restrict globalconstchar4restrictp,\n"
-    "                                            global const uchar4*restrict globalconstuchar4restrictp,\n"
-    "                                            global const short4 *restrict globalconstshort4restrictp,\n"
-    "                                            global const ushort4* restrict globalconstushort4restrictp,\n"
-    "                                            global const int4 * restrict globalconstint4restrictp,\n"
-    "                                            global const uint4*restrict globalconstuint4restrictp,\n"
-    "                                            global const long4 *restrict globalconstlong4restrictp,\n"
-    "                                            global const ulong4* restrict globalconstulong4restrictp,\n"
-    "                                            global const float4 * restrict globalconstfloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector4_p(global volatile char4*globalvolatilechar4p,\n"
-    "                                      global volatile uchar4 *globalvolatileuchar4p,\n"
-    "                                      global volatile short4* globalvolatileshort4p,\n"
-    "                                      global volatile ushort4 * globalvolatileushort4p,\n"
-    "                                      global volatile int4*globalvolatileint4p,\n"
-    "                                      global volatile uint4 *globalvolatileuint4p,\n"
-    "                                      global volatile long4* globalvolatilelong4p,\n"
-    "                                      global volatile ulong4 * globalvolatileulong4p,\n"
-    "                                      global volatile float4*globalvolatilefloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector4_restrict_p(global volatile char4 *restrict globalvolatilechar4restrictp,\n"
-    "                                               global volatile uchar4* restrict globalvolatileuchar4restrictp,\n"
-    "                                               global volatile short4 * restrict globalvolatileshort4restrictp,\n"
-    "                                               global volatile ushort4*restrict globalvolatileushort4restrictp,\n"
-    "                                               global volatile int4 *restrict globalvolatileint4restrictp,\n"
-    "                                               global volatile uint4* restrict globalvolatileuint4restrictp,\n"
-    "                                               global volatile long4 * restrict globalvolatilelong4restrictp,\n"
-    "                                               global volatile ulong4*restrict globalvolatileulong4restrictp,\n"
-    "                                               global volatile float4 *restrict globalvolatilefloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector4_p(global const volatile char4* globalconstvolatilechar4p,\n"
-    "                                            global const volatile uchar4 * globalconstvolatileuchar4p,\n"
-    "                                            global const volatile short4*globalconstvolatileshort4p,\n"
-    "                                            global const volatile ushort4 *globalconstvolatileushort4p,\n"
-    "                                            global const volatile int4* globalconstvolatileint4p,\n"
-    "                                            global const volatile uint4 * globalconstvolatileuint4p,\n"
-    "                                            global const volatile long4*globalconstvolatilelong4p,\n"
-    "                                            global const volatile ulong4 *globalconstvolatileulong4p,\n"
-    "                                            global const volatile float4* globalconstvolatilefloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector4_restrict_p(global const volatile char4 * restrict globalconstvolatilechar4restrictp,\n"
-    "                                                     global const volatile uchar4*restrict globalconstvolatileuchar4restrictp,\n"
-    "                                                     global const volatile short4 *restrict globalconstvolatileshort4restrictp,\n"
-    "                                                     global const volatile ushort4* restrict globalconstvolatileushort4restrictp,\n"
-    "                                                     global const volatile int4 * restrict globalconstvolatileint4restrictp,\n"
-    "                                                     global const volatile uint4*restrict globalconstvolatileuint4restrictp,\n"
-    "                                                     global const volatile long4 *restrict globalconstvolatilelong4restrictp,\n"
-    "                                                     global const volatile ulong4* restrict globalconstvolatileulong4restrictp,\n"
-    "                                                     global const volatile float4 * restrict globalconstvolatilefloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector4_p(local char4*localchar4p,\n"
-    "                            local uchar4 *localuchar4p,\n"
-    "                            local short4* localshort4p,\n"
-    "                            local ushort4 * localushort4p,\n"
-    "                            local int4*localint4p,\n"
-    "                            local uint4 *localuint4p,\n"
-    "                            local long4* locallong4p,\n"
-    "                            local ulong4 * localulong4p,\n"
-    "                            local float4*localfloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector4_restrict_p(local char4 *restrict localchar4restrictp,\n"
-    "                                     local uchar4* restrict localuchar4restrictp,\n"
-    "                                     local short4 * restrict localshort4restrictp,\n"
-    "                                     local ushort4*restrict localushort4restrictp,\n"
-    "                                     local int4 *restrict localint4restrictp,\n"
-    "                                     local uint4* restrict localuint4restrictp,\n"
-    "                                     local long4 * restrict locallong4restrictp,\n"
-    "                                     local ulong4*restrict localulong4restrictp,\n"
-    "                                     local float4 *restrict localfloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector4_p(local const char4* localconstchar4p,\n"
-    "                                  local const uchar4 * localconstuchar4p,\n"
-    "                                  local const short4*localconstshort4p,\n"
-    "                                  local const ushort4 *localconstushort4p,\n"
-    "                                  local const int4* localconstint4p,\n"
-    "                                  local const uint4 * localconstuint4p,\n"
-    "                                  local const long4*localconstlong4p,\n"
-    "                                  local const ulong4 *localconstulong4p,\n"
-    "                                  local const float4* localconstfloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector4_restrict_p(local const char4 * restrict localconstchar4restrictp,\n"
-    "                                           local const uchar4*restrict localconstuchar4restrictp,\n"
-    "                                           local const short4 *restrict localconstshort4restrictp,\n"
-    "                                           local const ushort4* restrict localconstushort4restrictp,\n"
-    "                                           local const int4 * restrict localconstint4restrictp,\n"
-    "                                           local const uint4*restrict localconstuint4restrictp,\n"
-    "                                           local const long4 *restrict localconstlong4restrictp,\n"
-    "                                           local const ulong4* restrict localconstulong4restrictp,\n"
-    "                                           local const float4 * restrict localconstfloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector4_p(local volatile char4*localvolatilechar4p,\n"
-    "                                     local volatile uchar4 *localvolatileuchar4p,\n"
-    "                                     local volatile short4* localvolatileshort4p,\n"
-    "                                     local volatile ushort4 * localvolatileushort4p,\n"
-    "                                     local volatile int4*localvolatileint4p,\n"
-    "                                     local volatile uint4 *localvolatileuint4p,\n"
-    "                                     local volatile long4* localvolatilelong4p,\n"
-    "                                     local volatile ulong4 * localvolatileulong4p,\n"
-    "                                     local volatile float4*localvolatilefloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector4_restrict_p(local volatile char4 *restrict localvolatilechar4restrictp,\n"
-    "                                              local volatile uchar4* restrict localvolatileuchar4restrictp,\n"
-    "                                              local volatile short4 * restrict localvolatileshort4restrictp,\n"
-    "                                              local volatile ushort4*restrict localvolatileushort4restrictp,\n"
-    "                                              local volatile int4 *restrict localvolatileint4restrictp,\n"
-    "                                              local volatile uint4* restrict localvolatileuint4restrictp,\n"
-    "                                              local volatile long4 * restrict localvolatilelong4restrictp,\n"
-    "                                              local volatile ulong4*restrict localvolatileulong4restrictp,\n"
-    "                                              local volatile float4 *restrict localvolatilefloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector4_p(local const volatile char4* localconstvolatilechar4p,\n"
-    "                                           local const volatile uchar4 * localconstvolatileuchar4p,\n"
-    "                                           local const volatile short4*localconstvolatileshort4p,\n"
-    "                                           local const volatile ushort4 *localconstvolatileushort4p,\n"
-    "                                           local const volatile int4* localconstvolatileint4p,\n"
-    "                                           local const volatile uint4 * localconstvolatileuint4p,\n"
-    "                                           local const volatile long4*localconstvolatilelong4p,\n"
-    "                                           local const volatile ulong4 *localconstvolatileulong4p,\n"
-    "                                           local const volatile float4* localconstvolatilefloat4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector4_restrict_p(local const volatile char4 * restrict localconstvolatilechar4restrictp,\n"
-    "                                                    local const volatile uchar4*restrict localconstvolatileuchar4restrictp,\n"
-    "                                                    local const volatile short4 *restrict localconstvolatileshort4restrictp,\n"
-    "                                                    local const volatile ushort4* restrict localconstvolatileushort4restrictp,\n"
-    "                                                    local const volatile int4 * restrict localconstvolatileint4restrictp,\n"
-    "                                                    local const volatile uint4*restrict localconstvolatileuint4restrictp,\n"
-    "                                                    local const volatile long4 *restrict localconstvolatilelong4restrictp,\n"
-    "                                                    local const volatile ulong4* restrict localconstvolatileulong4restrictp,\n"
-    "                                                    local const volatile float4 * restrict localconstvolatilefloat4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void vector4_d(char4 char4d,\n"
-    "                      uchar4 uchar4d,\n"
-    "                      short4 short4d,\n"
-    "                      ushort4 ushort4d,\n"
-    "                      int4 int4d,\n"
-    "                      uint4 uint4d,\n"
-    "                      long4 long4d,\n"
-    "                      ulong4 ulong4d,\n"
-    "                      float4 float4d)\n"
-    "{}\n",
-    "\n"
-    "kernel void const_vector4_d(const char4 constchar4d,\n"
-    "                            const uchar4 constuchar4d,\n"
-    "                            const short4 constshort4d,\n"
-    "                            const ushort4 constushort4d,\n"
-    "                            const int4 constint4d,\n"
-    "                            const uint4 constuint4d,\n"
-    "                            const long4 constlong4d,\n"
-    "                            const ulong4 constulong4d,\n"
-    "                            const float4 constfloat4d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_vector4_d(private char4 privatechar4d,\n"
-    "                              private uchar4 privateuchar4d,\n"
-    "                              private short4 privateshort4d,\n"
-    "                              private ushort4 privateushort4d,\n"
-    "                              private int4 privateint4d,\n"
-    "                              private uint4 privateuint4d,\n"
-    "                              private long4 privatelong4d,\n"
-    "                              private ulong4 privateulong4d,\n"
-    "                              private float4 privatefloat4d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_const_vector4_d(private const char4 privateconstchar4d,\n"
-    "                                    private const uchar4 privateconstuchar4d,\n"
-    "                                    private const short4 privateconstshort4d,\n"
-    "                                    private const ushort4 privateconstushort4d,\n"
-    "                                    private const int4 privateconstint4d,\n"
-    "                                    private const uint4 privateconstuint4d,\n"
-    "                                    private const long4 privateconstlong4d,\n"
-    "                                    private const ulong4 privateconstulong4d,\n"
-    "                                    private const float4 privateconstfloat4d)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector8_p0(constant char8*constantchar8p,\n"
-    "                               constant uchar8 *constantuchar8p,\n"
-    "                               constant short8* constantshort8p,\n"
-    "                               constant ushort8 * constantushort8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector8_p1(constant int8*constantint8p,\n"
-    "                               constant uint8 *constantuint8p,\n"
-    "                               constant long8* constantlong8p,\n"
-    "                               constant ulong8 * constantulong8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector8_p2(constant float8*constantfloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector8_restrict_p0(constant char8 *restrict constantchar8restrictp,\n"
-    "                                        constant uchar8* restrict constantuchar8restrictp,\n"
-    "                                        constant short8 * restrict constantshort8restrictp,\n"
-    "                                        constant ushort8*restrict constantushort8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector8_restrict_p1(constant int8 *restrict constantint8restrictp,\n"
-    "                                        constant uint8* restrict constantuint8restrictp,\n"
-    "                                        constant long8 * restrict constantlong8restrictp,\n"
-    "                                        constant ulong8*restrict constantulong8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector8_restrict_p2(constant float8 *restrict constantfloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector8_p(global char8*globalchar8p,\n"
-    "                             global uchar8 *globaluchar8p,\n"
-    "                             global short8* globalshort8p,\n"
-    "                             global ushort8 * globalushort8p,\n"
-    "                             global int8*globalint8p,\n"
-    "                             global uint8 *globaluint8p,\n"
-    "                             global long8* globallong8p,\n"
-    "                             global ulong8 * globalulong8p,\n"
-    "                             global float8*globalfloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector8_restrict_p(global char8 *restrict globalchar8restrictp,\n"
-    "                                      global uchar8* restrict globaluchar8restrictp,\n"
-    "                                      global short8 * restrict globalshort8restrictp,\n"
-    "                                      global ushort8*restrict globalushort8restrictp,\n"
-    "                                      global int8 *restrict globalint8restrictp,\n"
-    "                                      global uint8* restrict globaluint8restrictp,\n"
-    "                                      global long8 * restrict globallong8restrictp,\n"
-    "                                      global ulong8*restrict globalulong8restrictp,\n"
-    "                                      global float8 *restrict globalfloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector8_p(global const char8* globalconstchar8p,\n"
-    "                                   global const uchar8 * globalconstuchar8p,\n"
-    "                                   global const short8*globalconstshort8p,\n"
-    "                                   global const ushort8 *globalconstushort8p,\n"
-    "                                   global const int8* globalconstint8p,\n"
-    "                                   global const uint8 * globalconstuint8p,\n"
-    "                                   global const long8*globalconstlong8p,\n"
-    "                                   global const ulong8 *globalconstulong8p,\n"
-    "                                   global const float8* globalconstfloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector8_restrict_p(global const char8 * restrict globalconstchar8restrictp,\n"
-    "                                            global const uchar8*restrict globalconstuchar8restrictp,\n"
-    "                                            global const short8 *restrict globalconstshort8restrictp,\n"
-    "                                            global const ushort8* restrict globalconstushort8restrictp,\n"
-    "                                            global const int8 * restrict globalconstint8restrictp,\n"
-    "                                            global const uint8*restrict globalconstuint8restrictp,\n"
-    "                                            global const long8 *restrict globalconstlong8restrictp,\n"
-    "                                            global const ulong8* restrict globalconstulong8restrictp,\n"
-    "                                            global const float8 * restrict globalconstfloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector8_p(global volatile char8*globalvolatilechar8p,\n"
-    "                                      global volatile uchar8 *globalvolatileuchar8p,\n"
-    "                                      global volatile short8* globalvolatileshort8p,\n"
-    "                                      global volatile ushort8 * globalvolatileushort8p,\n"
-    "                                      global volatile int8*globalvolatileint8p,\n"
-    "                                      global volatile uint8 *globalvolatileuint8p,\n"
-    "                                      global volatile long8* globalvolatilelong8p,\n"
-    "                                      global volatile ulong8 * globalvolatileulong8p,\n"
-    "                                      global volatile float8*globalvolatilefloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector8_restrict_p(global volatile char8 *restrict globalvolatilechar8restrictp,\n"
-    "                                               global volatile uchar8* restrict globalvolatileuchar8restrictp,\n"
-    "                                               global volatile short8 * restrict globalvolatileshort8restrictp,\n"
-    "                                               global volatile ushort8*restrict globalvolatileushort8restrictp,\n"
-    "                                               global volatile int8 *restrict globalvolatileint8restrictp,\n"
-    "                                               global volatile uint8* restrict globalvolatileuint8restrictp,\n"
-    "                                               global volatile long8 * restrict globalvolatilelong8restrictp,\n"
-    "                                               global volatile ulong8*restrict globalvolatileulong8restrictp,\n"
-    "                                               global volatile float8 *restrict globalvolatilefloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector8_p(global const volatile char8* globalconstvolatilechar8p,\n"
-    "                                            global const volatile uchar8 * globalconstvolatileuchar8p,\n"
-    "                                            global const volatile short8*globalconstvolatileshort8p,\n"
-    "                                            global const volatile ushort8 *globalconstvolatileushort8p,\n"
-    "                                            global const volatile int8* globalconstvolatileint8p,\n"
-    "                                            global const volatile uint8 * globalconstvolatileuint8p,\n"
-    "                                            global const volatile long8*globalconstvolatilelong8p,\n"
-    "                                            global const volatile ulong8 *globalconstvolatileulong8p,\n"
-    "                                            global const volatile float8* globalconstvolatilefloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector8_restrict_p(global const volatile char8 * restrict globalconstvolatilechar8restrictp,\n"
-    "                                                     global const volatile uchar8*restrict globalconstvolatileuchar8restrictp,\n"
-    "                                                     global const volatile short8 *restrict globalconstvolatileshort8restrictp,\n"
-    "                                                     global const volatile ushort8* restrict globalconstvolatileushort8restrictp,\n"
-    "                                                     global const volatile int8 * restrict globalconstvolatileint8restrictp,\n"
-    "                                                     global const volatile uint8*restrict globalconstvolatileuint8restrictp,\n"
-    "                                                     global const volatile long8 *restrict globalconstvolatilelong8restrictp,\n"
-    "                                                     global const volatile ulong8* restrict globalconstvolatileulong8restrictp,\n"
-    "                                                     global const volatile float8 * restrict globalconstvolatilefloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector8_p(local char8*localchar8p,\n"
-    "                            local uchar8 *localuchar8p,\n"
-    "                            local short8* localshort8p,\n"
-    "                            local ushort8 * localushort8p,\n"
-    "                            local int8*localint8p,\n"
-    "                            local uint8 *localuint8p,\n"
-    "                            local long8* locallong8p,\n"
-    "                            local ulong8 * localulong8p,\n"
-    "                            local float8*localfloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector8_restrict_p(local char8 *restrict localchar8restrictp,\n"
-    "                                     local uchar8* restrict localuchar8restrictp,\n"
-    "                                     local short8 * restrict localshort8restrictp,\n"
-    "                                     local ushort8*restrict localushort8restrictp,\n"
-    "                                     local int8 *restrict localint8restrictp,\n"
-    "                                     local uint8* restrict localuint8restrictp,\n"
-    "                                     local long8 * restrict locallong8restrictp,\n"
-    "                                     local ulong8*restrict localulong8restrictp,\n"
-    "                                     local float8 *restrict localfloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector8_p(local const char8* localconstchar8p,\n"
-    "                                  local const uchar8 * localconstuchar8p,\n"
-    "                                  local const short8*localconstshort8p,\n"
-    "                                  local const ushort8 *localconstushort8p,\n"
-    "                                  local const int8* localconstint8p,\n"
-    "                                  local const uint8 * localconstuint8p,\n"
-    "                                  local const long8*localconstlong8p,\n"
-    "                                  local const ulong8 *localconstulong8p,\n"
-    "                                  local const float8* localconstfloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector8_restrict_p(local const char8 * restrict localconstchar8restrictp,\n"
-    "                                           local const uchar8*restrict localconstuchar8restrictp,\n"
-    "                                           local const short8 *restrict localconstshort8restrictp,\n"
-    "                                           local const ushort8* restrict localconstushort8restrictp,\n"
-    "                                           local const int8 * restrict localconstint8restrictp,\n"
-    "                                           local const uint8*restrict localconstuint8restrictp,\n"
-    "                                           local const long8 *restrict localconstlong8restrictp,\n"
-    "                                           local const ulong8* restrict localconstulong8restrictp,\n"
-    "                                           local const float8 * restrict localconstfloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector8_p(local volatile char8*localvolatilechar8p,\n"
-    "                                     local volatile uchar8 *localvolatileuchar8p,\n"
-    "                                     local volatile short8* localvolatileshort8p,\n"
-    "                                     local volatile ushort8 * localvolatileushort8p,\n"
-    "                                     local volatile int8*localvolatileint8p,\n"
-    "                                     local volatile uint8 *localvolatileuint8p,\n"
-    "                                     local volatile long8* localvolatilelong8p,\n"
-    "                                     local volatile ulong8 * localvolatileulong8p,\n"
-    "                                     local volatile float8*localvolatilefloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector8_restrict_p(local volatile char8 *restrict localvolatilechar8restrictp,\n"
-    "                                              local volatile uchar8* restrict localvolatileuchar8restrictp,\n"
-    "                                              local volatile short8 * restrict localvolatileshort8restrictp,\n"
-    "                                              local volatile ushort8*restrict localvolatileushort8restrictp,\n"
-    "                                              local volatile int8 *restrict localvolatileint8restrictp,\n"
-    "                                              local volatile uint8* restrict localvolatileuint8restrictp,\n"
-    "                                              local volatile long8 * restrict localvolatilelong8restrictp,\n"
-    "                                              local volatile ulong8*restrict localvolatileulong8restrictp,\n"
-    "                                              local volatile float8 *restrict localvolatilefloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector8_p(local const volatile char8* localconstvolatilechar8p,\n"
-    "                                           local const volatile uchar8 * localconstvolatileuchar8p,\n"
-    "                                           local const volatile short8*localconstvolatileshort8p,\n"
-    "                                           local const volatile ushort8 *localconstvolatileushort8p,\n"
-    "                                           local const volatile int8* localconstvolatileint8p,\n"
-    "                                           local const volatile uint8 * localconstvolatileuint8p,\n"
-    "                                           local const volatile long8*localconstvolatilelong8p,\n"
-    "                                           local const volatile ulong8 *localconstvolatileulong8p,\n"
-    "                                           local const volatile float8* localconstvolatilefloat8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector8_restrict_p(local const volatile char8 * restrict localconstvolatilechar8restrictp,\n"
-    "                                                    local const volatile uchar8*restrict localconstvolatileuchar8restrictp,\n"
-    "                                                    local const volatile short8 *restrict localconstvolatileshort8restrictp,\n"
-    "                                                    local const volatile ushort8* restrict localconstvolatileushort8restrictp,\n"
-    "                                                    local const volatile int8 * restrict localconstvolatileint8restrictp,\n"
-    "                                                    local const volatile uint8*restrict localconstvolatileuint8restrictp,\n"
-    "                                                    local const volatile long8 *restrict localconstvolatilelong8restrictp,\n"
-    "                                                    local const volatile ulong8* restrict localconstvolatileulong8restrictp,\n"
-    "                                                    local const volatile float8 * restrict localconstvolatilefloat8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void vector8_d(char8 char8d,\n"
-    "                      uchar8 uchar8d,\n"
-    "                      short8 short8d,\n"
-    "                      ushort8 ushort8d,\n"
-    "                      int8 int8d,\n"
-    "                      uint8 uint8d,\n"
-    "                      long8 long8d,\n"
-    "                      ulong8 ulong8d,\n"
-    "                      float8 float8d)\n"
-    "{}\n",
-    "\n"
-    "kernel void const_vector8_d(const char8 constchar8d,\n"
-    "                            const uchar8 constuchar8d,\n"
-    "                            const short8 constshort8d,\n"
-    "                            const ushort8 constushort8d,\n"
-    "                            const int8 constint8d,\n"
-    "                            const uint8 constuint8d,\n"
-    "                            const long8 constlong8d,\n"
-    "                            const ulong8 constulong8d,\n"
-    "                            const float8 constfloat8d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_vector8_d(private char8 privatechar8d,\n"
-    "                              private uchar8 privateuchar8d,\n"
-    "                              private short8 privateshort8d,\n"
-    "                              private ushort8 privateushort8d,\n"
-    "                              private int8 privateint8d,\n"
-    "                              private uint8 privateuint8d,\n"
-    "                              private long8 privatelong8d,\n"
-    "                              private ulong8 privateulong8d,\n"
-    "                              private float8 privatefloat8d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_const_vector8_d(private const char8 privateconstchar8d,\n"
-    "                                    private const uchar8 privateconstuchar8d,\n"
-    "                                    private const short8 privateconstshort8d,\n"
-    "                                    private const ushort8 privateconstushort8d,\n"
-    "                                    private const int8 privateconstint8d,\n"
-    "                                    private const uint8 privateconstuint8d,\n"
-    "                                    private const long8 privateconstlong8d,\n"
-    "                                    private const ulong8 privateconstulong8d,\n"
-    "                                    private const float8 privateconstfloat8d)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector16_p0(constant char16*constantchar16p,\n"
-    "                                constant uchar16 *constantuchar16p,\n"
-    "                                constant short16* constantshort16p,\n"
-    "                                constant ushort16 * constantushort16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector16_p1(constant int16*constantint16p,\n"
-    "                                constant uint16 *constantuint16p,\n"
-    "                                constant long16* constantlong16p,\n"
-    "                                constant ulong16 * constantulong16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector16_p2(constant float16*constantfloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector16_restrict_p0(constant char16 *restrict constantchar16restrictp,\n"
-    "                                         constant uchar16* restrict constantuchar16restrictp,\n"
-    "                                         constant short16 * restrict constantshort16restrictp,\n"
-    "                                         constant ushort16*restrict constantushort16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector16_restrict_p1(constant int16 *restrict constantint16restrictp,\n"
-    "                                         constant uint16* restrict constantuint16restrictp,\n"
-    "                                         constant long16 * restrict constantlong16restrictp,\n"
-    "                                         constant ulong16*restrict constantulong16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_vector16_restrict_p2(constant float16 *restrict constantfloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector16_p(global char16*globalchar16p,\n"
-    "                              global uchar16 *globaluchar16p,\n"
-    "                              global short16* globalshort16p,\n"
-    "                              global ushort16 * globalushort16p,\n"
-    "                              global int16*globalint16p,\n"
-    "                              global uint16 *globaluint16p,\n"
-    "                              global long16* globallong16p,\n"
-    "                              global ulong16 * globalulong16p,\n"
-    "                              global float16*globalfloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_vector16_restrict_p(global char16 *restrict globalchar16restrictp,\n"
-    "                                       global uchar16* restrict globaluchar16restrictp,\n"
-    "                                       global short16 * restrict globalshort16restrictp,\n"
-    "                                       global ushort16*restrict globalushort16restrictp,\n"
-    "                                       global int16 *restrict globalint16restrictp,\n"
-    "                                       global uint16* restrict globaluint16restrictp,\n"
-    "                                       global long16 * restrict globallong16restrictp,\n"
-    "                                       global ulong16*restrict globalulong16restrictp,\n"
-    "                                       global float16 *restrict globalfloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector16_p(global const char16* globalconstchar16p,\n"
-    "                                    global const uchar16 * globalconstuchar16p,\n"
-    "                                    global const short16*globalconstshort16p,\n"
-    "                                    global const ushort16 *globalconstushort16p,\n"
-    "                                    global const int16* globalconstint16p,\n"
-    "                                    global const uint16 * globalconstuint16p,\n"
-    "                                    global const long16*globalconstlong16p,\n"
-    "                                    global const ulong16 *globalconstulong16p,\n"
-    "                                    global const float16* globalconstfloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_vector16_restrict_p(global const char16 * restrict globalconstchar16restrictp,\n"
-    "                                             global const uchar16*restrict globalconstuchar16restrictp,\n"
-    "                                             global const short16 *restrict globalconstshort16restrictp,\n"
-    "                                             global const ushort16* restrict globalconstushort16restrictp,\n"
-    "                                             global const int16 * restrict globalconstint16restrictp,\n"
-    "                                             global const uint16*restrict globalconstuint16restrictp,\n"
-    "                                             global const long16 *restrict globalconstlong16restrictp,\n"
-    "                                             global const ulong16* restrict globalconstulong16restrictp,\n"
-    "                                             global const float16 * restrict globalconstfloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector16_p(global volatile char16*globalvolatilechar16p,\n"
-    "                                       global volatile uchar16 *globalvolatileuchar16p,\n"
-    "                                       global volatile short16* globalvolatileshort16p,\n"
-    "                                       global volatile ushort16 * globalvolatileushort16p,\n"
-    "                                       global volatile int16*globalvolatileint16p,\n"
-    "                                       global volatile uint16 *globalvolatileuint16p,\n"
-    "                                       global volatile long16* globalvolatilelong16p,\n"
-    "                                       global volatile ulong16 * globalvolatileulong16p,\n"
-    "                                       global volatile float16*globalvolatilefloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_vector16_restrict_p(global volatile char16 *restrict globalvolatilechar16restrictp,\n"
-    "                                                global volatile uchar16* restrict globalvolatileuchar16restrictp,\n"
-    "                                                global volatile short16 * restrict globalvolatileshort16restrictp,\n"
-    "                                                global volatile ushort16*restrict globalvolatileushort16restrictp,\n"
-    "                                                global volatile int16 *restrict globalvolatileint16restrictp,\n"
-    "                                                global volatile uint16* restrict globalvolatileuint16restrictp,\n"
-    "                                                global volatile long16 * restrict globalvolatilelong16restrictp,\n"
-    "                                                global volatile ulong16*restrict globalvolatileulong16restrictp,\n"
-    "                                                global volatile float16 *restrict globalvolatilefloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector16_p(global const volatile char16* globalconstvolatilechar16p,\n"
-    "                                             global const volatile uchar16 * globalconstvolatileuchar16p,\n"
-    "                                             global const volatile short16*globalconstvolatileshort16p,\n"
-    "                                             global const volatile ushort16 *globalconstvolatileushort16p,\n"
-    "                                             global const volatile int16* globalconstvolatileint16p,\n"
-    "                                             global const volatile uint16 * globalconstvolatileuint16p,\n"
-    "                                             global const volatile long16*globalconstvolatilelong16p,\n"
-    "                                             global const volatile ulong16 *globalconstvolatileulong16p,\n"
-    "                                             global const volatile float16* globalconstvolatilefloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_vector16_restrict_p(global const volatile char16 * restrict globalconstvolatilechar16restrictp,\n"
-    "                                                      global const volatile uchar16*restrict globalconstvolatileuchar16restrictp,\n"
-    "                                                      global const volatile short16 *restrict globalconstvolatileshort16restrictp,\n"
-    "                                                      global const volatile ushort16* restrict globalconstvolatileushort16restrictp,\n"
-    "                                                      global const volatile int16 * restrict globalconstvolatileint16restrictp,\n"
-    "                                                      global const volatile uint16*restrict globalconstvolatileuint16restrictp,\n"
-    "                                                      global const volatile long16 *restrict globalconstvolatilelong16restrictp,\n"
-    "                                                      global const volatile ulong16* restrict globalconstvolatileulong16restrictp,\n"
-    "                                                      global const volatile float16 * restrict globalconstvolatilefloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector16_p(local char16*localchar16p,\n"
-    "                             local uchar16 *localuchar16p,\n"
-    "                             local short16* localshort16p,\n"
-    "                             local ushort16 * localushort16p,\n"
-    "                             local int16*localint16p,\n"
-    "                             local uint16 *localuint16p,\n"
-    "                             local long16* locallong16p,\n"
-    "                             local ulong16 * localulong16p,\n"
-    "                             local float16*localfloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_vector16_restrict_p(local char16 *restrict localchar16restrictp,\n"
-    "                                      local uchar16* restrict localuchar16restrictp,\n"
-    "                                      local short16 * restrict localshort16restrictp,\n"
-    "                                      local ushort16*restrict localushort16restrictp,\n"
-    "                                      local int16 *restrict localint16restrictp,\n"
-    "                                      local uint16* restrict localuint16restrictp,\n"
-    "                                      local long16 * restrict locallong16restrictp,\n"
-    "                                      local ulong16*restrict localulong16restrictp,\n"
-    "                                      local float16 *restrict localfloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector16_p(local const char16* localconstchar16p,\n"
-    "                                   local const uchar16 * localconstuchar16p,\n"
-    "                                   local const short16*localconstshort16p,\n"
-    "                                   local const ushort16 *localconstushort16p,\n"
-    "                                   local const int16* localconstint16p,\n"
-    "                                   local const uint16 * localconstuint16p,\n"
-    "                                   local const long16*localconstlong16p,\n"
-    "                                   local const ulong16 *localconstulong16p,\n"
-    "                                   local const float16* localconstfloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_vector16_restrict_p(local const char16 * restrict localconstchar16restrictp,\n"
-    "                                            local const uchar16*restrict localconstuchar16restrictp,\n"
-    "                                            local const short16 *restrict localconstshort16restrictp,\n"
-    "                                            local const ushort16* restrict localconstushort16restrictp,\n"
-    "                                            local const int16 * restrict localconstint16restrictp,\n"
-    "                                            local const uint16*restrict localconstuint16restrictp,\n"
-    "                                            local const long16 *restrict localconstlong16restrictp,\n"
-    "                                            local const ulong16* restrict localconstulong16restrictp,\n"
-    "                                            local const float16 * restrict localconstfloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector16_p(local volatile char16*localvolatilechar16p,\n"
-    "                                      local volatile uchar16 *localvolatileuchar16p,\n"
-    "                                      local volatile short16* localvolatileshort16p,\n"
-    "                                      local volatile ushort16 * localvolatileushort16p,\n"
-    "                                      local volatile int16*localvolatileint16p,\n"
-    "                                      local volatile uint16 *localvolatileuint16p,\n"
-    "                                      local volatile long16* localvolatilelong16p,\n"
-    "                                      local volatile ulong16 * localvolatileulong16p,\n"
-    "                                      local volatile float16*localvolatilefloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_vector16_restrict_p(local volatile char16 *restrict localvolatilechar16restrictp,\n"
-    "                                               local volatile uchar16* restrict localvolatileuchar16restrictp,\n"
-    "                                               local volatile short16 * restrict localvolatileshort16restrictp,\n"
-    "                                               local volatile ushort16*restrict localvolatileushort16restrictp,\n"
-    "                                               local volatile int16 *restrict localvolatileint16restrictp,\n"
-    "                                               local volatile uint16* restrict localvolatileuint16restrictp,\n"
-    "                                               local volatile long16 * restrict localvolatilelong16restrictp,\n"
-    "                                               local volatile ulong16*restrict localvolatileulong16restrictp,\n"
-    "                                               local volatile float16 *restrict localvolatilefloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector16_p(local const volatile char16* localconstvolatilechar16p,\n"
-    "                                            local const volatile uchar16 * localconstvolatileuchar16p,\n"
-    "                                            local const volatile short16*localconstvolatileshort16p,\n"
-    "                                            local const volatile ushort16 *localconstvolatileushort16p,\n"
-    "                                            local const volatile int16* localconstvolatileint16p,\n"
-    "                                            local const volatile uint16 * localconstvolatileuint16p,\n"
-    "                                            local const volatile long16*localconstvolatilelong16p,\n"
-    "                                            local const volatile ulong16 *localconstvolatileulong16p,\n"
-    "                                            local const volatile float16* localconstvolatilefloat16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_vector16_restrict_p(local const volatile char16 * restrict localconstvolatilechar16restrictp,\n"
-    "                                                     local const volatile uchar16*restrict localconstvolatileuchar16restrictp,\n"
-    "                                                     local const volatile short16 *restrict localconstvolatileshort16restrictp,\n"
-    "                                                     local const volatile ushort16* restrict localconstvolatileushort16restrictp,\n"
-    "                                                     local const volatile int16 * restrict localconstvolatileint16restrictp,\n"
-    "                                                     local const volatile uint16*restrict localconstvolatileuint16restrictp,\n"
-    "                                                     local const volatile long16 *restrict localconstvolatilelong16restrictp,\n"
-    "                                                     local const volatile ulong16* restrict localconstvolatileulong16restrictp,\n"
-    "                                                     local const volatile float16 * restrict localconstvolatilefloat16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void vector16_d(char16 char16d,\n"
-    "                       uchar16 uchar16d,\n"
-    "                       short16 short16d,\n"
-    "                       ushort16 ushort16d,\n"
-    "                       int16 int16d,\n"
-    "                       uint16 uint16d,\n"
-    "                       long16 long16d,\n"
-    "                       ulong16 ulong16d,\n"
-    "                       float16 float16d)\n"
-    "{}\n",
-    "\n"
-    "kernel void const_vector16_d(const char16 constchar16d,\n"
-    "                             const uchar16 constuchar16d,\n"
-    "                             const short16 constshort16d,\n"
-    "                             const ushort16 constushort16d,\n"
-    "                             const int16 constint16d,\n"
-    "                             const uint16 constuint16d,\n"
-    "                             const long16 constlong16d,\n"
-    "                             const ulong16 constulong16d,\n"
-    "                             const float16 constfloat16d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_vector16_d(private char16 privatechar16d,\n"
-    "                               private uchar16 privateuchar16d,\n"
-    "                               private short16 privateshort16d,\n"
-    "                               private ushort16 privateushort16d,\n"
-    "                               private int16 privateint16d,\n"
-    "                               private uint16 privateuint16d,\n"
-    "                               private long16 privatelong16d,\n"
-    "                               private ulong16 privateulong16d,\n"
-    "                               private float16 privatefloat16d)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_const_vector16_d(private const char16 privateconstchar16d,\n"
-    "                                     private const uchar16 privateconstuchar16d,\n"
-    "                                     private const short16 privateconstshort16d,\n"
-    "                                     private const ushort16 privateconstushort16d,\n"
-    "                                     private const int16 privateconstint16d,\n"
-    "                                     private const uint16 privateconstuint16d,\n"
-    "                                     private const long16 privateconstlong16d,\n"
-    "                                     private const ulong16 privateconstulong16d,\n"
-    "                                     private const float16 privateconstfloat16d)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_derived_p0(constant typedef_type*constanttypedef_typep,\n"
-    "                               constant struct struct_type *constantstructstruct_typep,\n"
-    "                               constant typedef_struct_type* constanttypedef_struct_typep,\n"
-    "                               constant union union_type * constantunionunion_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_derived_p1(constant typedef_union_type*constanttypedef_union_typep,\n"
-    "                               constant enum enum_type *constantenumenum_typep,\n"
-    "                               constant typedef_enum_type* constanttypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_derived_restrict_p0(constant typedef_type * restrict constanttypedef_typerestrictp,\n"
-    "                                        constant struct struct_type*restrict constantstructstruct_typerestrictp,\n"
-    "                                        constant typedef_struct_type *restrict constanttypedef_struct_typerestrictp,\n"
-    "                                        constant union union_type* restrict constantunionunion_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void constant_derived_restrict_p1(constant typedef_union_type * restrict constanttypedef_union_typerestrictp,\n"
-    "                                        constant enum enum_type*restrict constantenumenum_typerestrictp,\n"
-    "                                        constant typedef_enum_type *restrict constanttypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_derived_p(global typedef_type*globaltypedef_typep,\n"
-    "                             global struct struct_type *globalstructstruct_typep,\n"
-    "                             global typedef_struct_type* globaltypedef_struct_typep,\n"
-    "                             global union union_type * globalunionunion_typep,\n"
-    "                             global typedef_union_type*globaltypedef_union_typep,\n"
-    "                             global enum enum_type *globalenumenum_typep,\n"
-    "                             global typedef_enum_type* globaltypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_derived_restrict_p(global typedef_type * restrict globaltypedef_typerestrictp,\n"
-    "                                      global struct struct_type*restrict globalstructstruct_typerestrictp,\n"
-    "                                      global typedef_struct_type *restrict globaltypedef_struct_typerestrictp,\n"
-    "                                      global union union_type* restrict globalunionunion_typerestrictp,\n"
-    "                                      global typedef_union_type * restrict globaltypedef_union_typerestrictp,\n"
-    "                                      global enum enum_type*restrict globalenumenum_typerestrictp,\n"
-    "                                      global typedef_enum_type *restrict globaltypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_derived_p(global const typedef_type* globalconsttypedef_typep,\n"
-    "                                   global const struct struct_type * globalconststructstruct_typep,\n"
-    "                                   global const typedef_struct_type*globalconsttypedef_struct_typep,\n"
-    "                                   global const union union_type *globalconstunionunion_typep,\n"
-    "                                   global const typedef_union_type* globalconsttypedef_union_typep,\n"
-    "                                   global const enum enum_type * globalconstenumenum_typep,\n"
-    "                                   global const typedef_enum_type*globalconsttypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_derived_restrict_p(global const typedef_type *restrict globalconsttypedef_typerestrictp,\n"
-    "                                            global const struct struct_type* restrict globalconststructstruct_typerestrictp,\n"
-    "                                            global const typedef_struct_type * restrict globalconsttypedef_struct_typerestrictp,\n"
-    "                                            global const union union_type*restrict globalconstunionunion_typerestrictp,\n"
-    "                                            global const typedef_union_type *restrict globalconsttypedef_union_typerestrictp,\n"
-    "                                            global const enum enum_type* restrict globalconstenumenum_typerestrictp,\n"
-    "                                            global const typedef_enum_type * restrict globalconsttypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_derived_p(global volatile typedef_type*globalvolatiletypedef_typep,\n"
-    "                                      global volatile struct struct_type *globalvolatilestructstruct_typep,\n"
-    "                                      global volatile typedef_struct_type* globalvolatiletypedef_struct_typep,\n"
-    "                                      global volatile union union_type * globalvolatileunionunion_typep,\n"
-    "                                      global volatile typedef_union_type*globalvolatiletypedef_union_typep,\n"
-    "                                      global volatile enum enum_type *globalvolatileenumenum_typep,\n"
-    "                                      global volatile typedef_enum_type* globalvolatiletypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_volatile_derived_restrict_p(global volatile typedef_type * restrict globalvolatiletypedef_typerestrictp,\n"
-    "                                               global volatile struct struct_type*restrict globalvolatilestructstruct_typerestrictp,\n"
-    "                                               global volatile typedef_struct_type *restrict globalvolatiletypedef_struct_typerestrictp,\n"
-    "                                               global volatile union union_type* restrict globalvolatileunionunion_typerestrictp,\n"
-    "                                               global volatile typedef_union_type * restrict globalvolatiletypedef_union_typerestrictp,\n"
-    "                                               global volatile enum enum_type*restrict globalvolatileenumenum_typerestrictp,\n"
-    "                                               global volatile typedef_enum_type *restrict globalvolatiletypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_derived_p(global const volatile typedef_type* globalconstvolatiletypedef_typep,\n"
-    "                                            global const volatile struct struct_type * globalconstvolatilestructstruct_typep,\n"
-    "                                            global const volatile typedef_struct_type*globalconstvolatiletypedef_struct_typep,\n"
-    "                                            global const volatile union union_type *globalconstvolatileunionunion_typep,\n"
-    "                                            global const volatile typedef_union_type* globalconstvolatiletypedef_union_typep,\n"
-    "                                            global const volatile enum enum_type * globalconstvolatileenumenum_typep,\n"
-    "                                            global const volatile typedef_enum_type*globalconstvolatiletypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void global_const_volatile_derived_restrict_p(global const volatile typedef_type *restrict globalconstvolatiletypedef_typerestrictp,\n"
-    "                                                     global const volatile struct struct_type* restrict globalconstvolatilestructstruct_typerestrictp,\n"
-    "                                                     global const volatile typedef_struct_type * restrict globalconstvolatiletypedef_struct_typerestrictp,\n"
-    "                                                     global const volatile union union_type*restrict globalconstvolatileunionunion_typerestrictp,\n"
-    "                                                     global const volatile typedef_union_type *restrict globalconstvolatiletypedef_union_typerestrictp,\n"
-    "                                                     global const volatile enum enum_type* restrict globalconstvolatileenumenum_typerestrictp,\n"
-    "                                                     global const volatile typedef_enum_type * restrict globalconstvolatiletypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_derived_p(local typedef_type*localtypedef_typep,\n"
-    "                            local struct struct_type *localstructstruct_typep,\n"
-    "                            local typedef_struct_type* localtypedef_struct_typep,\n"
-    "                            local union union_type * localunionunion_typep,\n"
-    "                            local typedef_union_type*localtypedef_union_typep,\n"
-    "                            local enum enum_type *localenumenum_typep,\n"
-    "                            local typedef_enum_type* localtypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_derived_restrict_p(local typedef_type * restrict localtypedef_typerestrictp,\n"
-    "                                     local struct struct_type*restrict localstructstruct_typerestrictp,\n"
-    "                                     local typedef_struct_type *restrict localtypedef_struct_typerestrictp,\n"
-    "                                     local union union_type* restrict localunionunion_typerestrictp,\n"
-    "                                     local typedef_union_type * restrict localtypedef_union_typerestrictp,\n"
-    "                                     local enum enum_type*restrict localenumenum_typerestrictp,\n"
-    "                                     local typedef_enum_type *restrict localtypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_derived_p(local const typedef_type* localconsttypedef_typep,\n"
-    "                                  local const struct struct_type * localconststructstruct_typep,\n"
-    "                                  local const typedef_struct_type*localconsttypedef_struct_typep,\n"
-    "                                  local const union union_type *localconstunionunion_typep,\n"
-    "                                  local const typedef_union_type* localconsttypedef_union_typep,\n"
-    "                                  local const enum enum_type * localconstenumenum_typep,\n"
-    "                                  local const typedef_enum_type*localconsttypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_derived_restrict_p(local const typedef_type *restrict localconsttypedef_typerestrictp,\n"
-    "                                           local const struct struct_type* restrict localconststructstruct_typerestrictp,\n"
-    "                                           local const typedef_struct_type * restrict localconsttypedef_struct_typerestrictp,\n"
-    "                                           local const union union_type*restrict localconstunionunion_typerestrictp,\n"
-    "                                           local const typedef_union_type *restrict localconsttypedef_union_typerestrictp,\n"
-    "                                           local const enum enum_type* restrict localconstenumenum_typerestrictp,\n"
-    "                                           local const typedef_enum_type * restrict localconsttypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_derived_p(local volatile typedef_type*localvolatiletypedef_typep,\n"
-    "                                     local volatile struct struct_type *localvolatilestructstruct_typep,\n"
-    "                                     local volatile typedef_struct_type* localvolatiletypedef_struct_typep,\n"
-    "                                     local volatile union union_type * localvolatileunionunion_typep,\n"
-    "                                     local volatile typedef_union_type*localvolatiletypedef_union_typep,\n"
-    "                                     local volatile enum enum_type *localvolatileenumenum_typep,\n"
-    "                                     local volatile typedef_enum_type* localvolatiletypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_volatile_derived_restrict_p(local volatile typedef_type * restrict localvolatiletypedef_typerestrictp,\n"
-    "                                              local volatile struct struct_type*restrict localvolatilestructstruct_typerestrictp,\n"
-    "                                              local volatile typedef_struct_type *restrict localvolatiletypedef_struct_typerestrictp,\n"
-    "                                              local volatile union union_type* restrict localvolatileunionunion_typerestrictp,\n"
-    "                                              local volatile typedef_union_type * restrict localvolatiletypedef_union_typerestrictp,\n"
-    "                                              local volatile enum enum_type*restrict localvolatileenumenum_typerestrictp,\n"
-    "                                              local volatile typedef_enum_type *restrict localvolatiletypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_derived_p(local const volatile typedef_type* localconstvolatiletypedef_typep,\n"
-    "                                           local const volatile struct struct_type * localconstvolatilestructstruct_typep,\n"
-    "                                           local const volatile typedef_struct_type*localconstvolatiletypedef_struct_typep,\n"
-    "                                           local const volatile union union_type *localconstvolatileunionunion_typep,\n"
-    "                                           local const volatile typedef_union_type* localconstvolatiletypedef_union_typep,\n"
-    "                                           local const volatile enum enum_type * localconstvolatileenumenum_typep,\n"
-    "                                           local const volatile typedef_enum_type*localconstvolatiletypedef_enum_typep)\n"
-    "{}\n",
-    "\n"
-    "kernel void local_const_volatile_derived_restrict_p(local const volatile typedef_type *restrict localconstvolatiletypedef_typerestrictp,\n"
-    "                                                    local const volatile struct struct_type* restrict localconstvolatilestructstruct_typerestrictp,\n"
-    "                                                    local const volatile typedef_struct_type * restrict localconstvolatiletypedef_struct_typerestrictp,\n"
-    "                                                    local const volatile union union_type*restrict localconstvolatileunionunion_typerestrictp,\n"
-    "                                                    local const volatile typedef_union_type *restrict localconstvolatiletypedef_union_typerestrictp,\n"
-    "                                                    local const volatile enum enum_type* restrict localconstvolatileenumenum_typerestrictp,\n"
-    "                                                    local const volatile typedef_enum_type * restrict localconstvolatiletypedef_enum_typerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void derived_d(typedef_type typedef_typed,\n"
-    "                      struct struct_type structstruct_typed,\n"
-    "                      typedef_struct_type typedef_struct_typed,\n"
-    "                      union union_type unionunion_typed,\n"
-    "                      typedef_union_type typedef_union_typed,\n"
-    "                      enum enum_type enumenum_typed,\n"
-    "                      typedef_enum_type typedef_enum_typed)\n"
-    "{}\n",
-    "\n"
-    "kernel void const_derived_d(const typedef_type consttypedef_typed,\n"
-    "                            const struct struct_type conststructstruct_typed,\n"
-    "                            const typedef_struct_type consttypedef_struct_typed,\n"
-    "                            const union union_type constunionunion_typed,\n"
-    "                            const typedef_union_type consttypedef_union_typed,\n"
-    "                            const enum enum_type constenumenum_typed,\n"
-    "                            const typedef_enum_type consttypedef_enum_typed)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_derived_d(private typedef_type privatetypedef_typed,\n"
-    "                              private struct struct_type privatestructstruct_typed,\n"
-    "                              private typedef_struct_type privatetypedef_struct_typed,\n"
-    "                              private union union_type privateunionunion_typed,\n"
-    "                              private typedef_union_type privatetypedef_union_typed,\n"
-    "                              private enum enum_type privateenumenum_typed,\n"
-    "                              private typedef_enum_type privatetypedef_enum_typed)\n"
-    "{}\n",
-    "\n"
-    "kernel void private_const_derived_d(private const typedef_type privateconsttypedef_typed,\n"
-    "                                    private const struct struct_type privateconststructstruct_typed,\n"
-    "                                    private const typedef_struct_type privateconsttypedef_struct_typed,\n"
-    "                                    private const union union_type privateconstunionunion_typed,\n"
-    "                                    private const typedef_union_type privateconsttypedef_union_typed,\n"
-    "                                    private const enum enum_type privateconstenumenum_typed,\n"
-    "                                    private const typedef_enum_type privateconsttypedef_enum_typed)\n"
-    "{}\n",
-    "\n"
-};
-
-static const char * required_arg_info[][72] = {
-  // The minimum value of CL_DEVICE_MAX_CONSTANT_ARGS is 4
-    {
-        "constant_scalar_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "void*", "constantvoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char*", "constantcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar*", "constantucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar*", "constantunsignedcharp",
-    NULL
-  },
-  {
-    "constant_scalar_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short*", "constantshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort*", "constantushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort*", "constantunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int*", "constantintp",
-    NULL
-  },
-  {
-    "constant_scalar_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint*", "constantuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint*", "constantunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long*", "constantlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong*", "constantulongp",
-    NULL
-  },
-  {
-    "constant_scalar_p3",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong*", "constantunsignedlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float*", "constantfloatp",
-        NULL
-    },
-    {
-        "constant_scalar_restrict_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "constantvoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "constantcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "constantucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "constantunsignedcharrestrictp",
-    NULL
-  },
-  {
-    "constant_scalar_restrict_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "constantshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "constantushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "constantunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "constantintrestrictp",
-    NULL
-  },
-  {
-    "constant_scalar_restrict_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "constantuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "constantunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "constantlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "constantulongrestrictp",
-    NULL
-  },
-  {
-    "constant_scalar_restrict_p3",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "constantunsignedlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "constantfloatrestrictp",
-        NULL
-    },
-    {
-        "global_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "void*", "globalvoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char*", "globalcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar*", "globalucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar*", "globalunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short*", "globalshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort*", "globalushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort*", "globalunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int*", "globalintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint*", "globaluintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint*", "globalunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long*", "globallongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong*", "globalulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong*", "globalunsignedlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float*", "globalfloatp",
-        NULL
-    },
-    {
-        "global_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "globalvoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "globalcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "globalshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "globalintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globaluintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globalunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "globallongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalunsignedlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "globalfloatrestrictp",
-        NULL
-    },
-    {
-        "global_const_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "void*", "globalconstvoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char*", "globalconstcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar*", "globalconstucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar*", "globalconstunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short*", "globalconstshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort*", "globalconstushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort*", "globalconstunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int*", "globalconstintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint*", "globalconstuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint*", "globalconstunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long*", "globalconstlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong*", "globalconstulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong*", "globalconstunsignedlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float*", "globalconstfloatp",
-        NULL
-    },
-    {
-        "global_const_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "globalconstvoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "globalconstcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalconstucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalconstunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "globalconstshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalconstushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalconstunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "globalconstintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globalconstuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globalconstunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "globalconstlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalconstulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalconstunsignedlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "globalconstfloatrestrictp",
-        NULL
-    },
-    {
-        "global_volatile_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "void*", "globalvolatilevoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char*", "globalvolatilecharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "globalvolatileucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "globalvolatileunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short*", "globalvolatileshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "globalvolatileushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "globalvolatileunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int*", "globalvolatileintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "globalvolatileuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "globalvolatileunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long*", "globalvolatilelongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "globalvolatileulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "globalvolatileunsignedlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float*", "globalvolatilefloatp",
-        NULL
-    },
-    {
-        "global_volatile_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "globalvolatilevoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "globalvolatilecharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalvolatileucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalvolatileunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "globalvolatileshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalvolatileushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalvolatileunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "globalvolatileintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globalvolatileuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globalvolatileunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "globalvolatilelongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalvolatileulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalvolatileunsignedlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "globalvolatilefloatrestrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "void*", "globalconstvolatilevoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char*", "globalconstvolatilecharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "globalconstvolatileucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "globalconstvolatileunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short*", "globalconstvolatileshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "globalconstvolatileushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "globalconstvolatileunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int*", "globalconstvolatileintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "globalconstvolatileuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "globalconstvolatileunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long*", "globalconstvolatilelongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "globalconstvolatileulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "globalconstvolatileunsignedlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float*", "globalconstvolatilefloatp",
-        NULL
-    },
-    {
-        "global_const_volatile_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "globalconstvolatilevoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "globalconstvolatilecharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalconstvolatileucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "globalconstvolatileunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "globalconstvolatileshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalconstvolatileushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "globalconstvolatileunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "globalconstvolatileintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globalconstvolatileuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "globalconstvolatileunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "globalconstvolatilelongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalconstvolatileulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "globalconstvolatileunsignedlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "globalconstvolatilefloatrestrictp",
-        NULL
-    },
-    {
-        "local_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "void*", "localvoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char*", "localcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar*", "localucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar*", "localunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short*", "localshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort*", "localushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort*", "localunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int*", "localintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint*", "localuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint*", "localunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long*", "locallongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong*", "localulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong*", "localunsignedlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float*", "localfloatp",
-        NULL
-    },
-    {
-        "local_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "localvoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "localcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "localshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "localintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "locallongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localunsignedlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "localfloatrestrictp",
-        NULL
-    },
-    {
-        "local_const_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "void*", "localconstvoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char*", "localconstcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar*", "localconstucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar*", "localconstunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short*", "localconstshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort*", "localconstushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort*", "localconstunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int*", "localconstintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint*", "localconstuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint*", "localconstunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long*", "localconstlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong*", "localconstulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong*", "localconstunsignedlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float*", "localconstfloatp",
-        NULL
-    },
-    {
-        "local_const_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "localconstvoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "localconstcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localconstucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localconstunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "localconstshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localconstushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localconstunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "localconstintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localconstuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localconstunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "localconstlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localconstulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localconstunsignedlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "localconstfloatrestrictp",
-        NULL
-    },
-    {
-        "local_volatile_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "void*", "localvolatilevoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char*", "localvolatilecharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "localvolatileucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "localvolatileunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short*", "localvolatileshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "localvolatileushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "localvolatileunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int*", "localvolatileintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "localvolatileuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "localvolatileunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long*", "localvolatilelongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "localvolatileulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "localvolatileunsignedlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float*", "localvolatilefloatp",
-        NULL
-    },
-    {
-        "local_volatile_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "localvolatilevoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "localvolatilecharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localvolatileucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localvolatileunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "localvolatileshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localvolatileushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localvolatileunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "localvolatileintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localvolatileuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localvolatileunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "localvolatilelongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localvolatileulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localvolatileunsignedlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "localvolatilefloatrestrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "void*", "localconstvolatilevoidp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char*", "localconstvolatilecharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "localconstvolatileucharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar*", "localconstvolatileunsignedcharp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short*", "localconstvolatileshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "localconstvolatileushortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort*", "localconstvolatileunsignedshortp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int*", "localconstvolatileintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "localconstvolatileuintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint*", "localconstvolatileunsignedintp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long*", "localconstvolatilelongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "localconstvolatileulongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong*", "localconstvolatileunsignedlongp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float*", "localconstvolatilefloatp",
-        NULL
-    },
-    {
-        "local_const_volatile_scalar_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "void*", "localconstvolatilevoidrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char*", "localconstvolatilecharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localconstvolatileucharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar*", "localconstvolatileunsignedcharrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short*", "localconstvolatileshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localconstvolatileushortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort*", "localconstvolatileunsignedshortrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int*", "localconstvolatileintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localconstvolatileuintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint*", "localconstvolatileunsignedintrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long*", "localconstvolatilelongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localconstvolatileulongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong*", "localconstvolatileunsignedlongrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float*", "localconstvolatilefloatrestrictp",
-        NULL
-    },
-    {
-        "scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char", "chard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "uchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "unsignedchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short", "shortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "ushortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "unsignedshortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int", "intd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "uintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "unsignedintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long", "longd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "ulongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "unsignedlongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float", "floatd",
-        NULL
-    },
-    {
-        "const_scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char", "constchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "constuchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "constunsignedchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short", "constshortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "constushortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "constunsignedshortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int", "constintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "constuintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "constunsignedintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long", "constlongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "constulongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "constunsignedlongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float", "constfloatd",
-        NULL
-    },
-    {
-        "private_scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char", "privatechard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "privateuchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "privateunsignedchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short", "privateshortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "privateushortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "privateunsignedshortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int", "privateintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "privateuintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "privateunsignedintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long", "privatelongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "privateulongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "privateunsignedlongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float", "privatefloatd",
-        NULL
-    },
-    {
-        "private_const_scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char", "privateconstchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "privateconstuchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar", "privateconstunsignedchard",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short", "privateconstshortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "privateconstushortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort", "privateconstunsignedshortd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int", "privateconstintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "privateconstuintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint", "privateconstunsignedintd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long", "privateconstlongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "privateconstulongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong", "privateconstunsignedlongd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float", "privateconstfloatd",
-        NULL
-    },
-    {
-        "constant_vector2_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char2*", "constantchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar2*", "constantuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short2*", "constantshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort2*", "constantushort2p",
-    NULL
-    },
-    {
-        "constant_vector2_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int2*", "constantint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint2*", "constantuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long2*", "constantlong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong2*", "constantulong2p",
-    NULL
-    },
-    {
-        "constant_vector2_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float2*", "constantfloat2p",
-        NULL
-    },
-    {
-        "constant_vector2_restrict_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "constantchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "constantuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "constantshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "constantushort2restrictp",
-    NULL
-    },
-    {
-        "constant_vector2_restrict_p1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "constantint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "constantuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "constantlong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "constantulong2restrictp",
-    NULL
-    },
-    {
-        "constant_vector2_restrict_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "constantfloat2restrictp",
-        NULL
-    },
-    {
-        "global_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char2*", "globalchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar2*", "globaluchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short2*", "globalshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort2*", "globalushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int2*", "globalint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint2*", "globaluint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long2*", "globallong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong2*", "globalulong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float2*", "globalfloat2p",
-        NULL
-    },
-    {
-        "global_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "globalchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "globaluchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "globalshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "globalushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "globalint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "globaluint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "globallong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "globalulong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "globalfloat2restrictp",
-        NULL
-    },
-    {
-        "global_const_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char2*", "globalconstchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar2*", "globalconstuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short2*", "globalconstshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort2*", "globalconstushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int2*", "globalconstint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint2*", "globalconstuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long2*", "globalconstlong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong2*", "globalconstulong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float2*", "globalconstfloat2p",
-        NULL
-    },
-    {
-        "global_const_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "globalconstchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "globalconstuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "globalconstshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "globalconstushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "globalconstint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "globalconstuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "globalconstlong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "globalconstulong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "globalconstfloat2restrictp",
-        NULL
-    },
-    {
-        "global_volatile_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char2*", "globalvolatilechar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar2*", "globalvolatileuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short2*", "globalvolatileshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort2*", "globalvolatileushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int2*", "globalvolatileint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint2*", "globalvolatileuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long2*", "globalvolatilelong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong2*", "globalvolatileulong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float2*", "globalvolatilefloat2p",
-        NULL
-    },
-    {
-        "global_volatile_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "globalvolatilechar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "globalvolatileuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "globalvolatileshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "globalvolatileushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "globalvolatileint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "globalvolatileuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "globalvolatilelong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "globalvolatileulong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "globalvolatilefloat2restrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char2*", "globalconstvolatilechar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar2*", "globalconstvolatileuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short2*", "globalconstvolatileshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort2*", "globalconstvolatileushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int2*", "globalconstvolatileint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint2*", "globalconstvolatileuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long2*", "globalconstvolatilelong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong2*", "globalconstvolatileulong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float2*", "globalconstvolatilefloat2p",
-        NULL
-    },
-    {
-        "global_const_volatile_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "globalconstvolatilechar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "globalconstvolatileuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "globalconstvolatileshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "globalconstvolatileushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "globalconstvolatileint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "globalconstvolatileuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "globalconstvolatilelong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "globalconstvolatileulong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "globalconstvolatilefloat2restrictp",
-        NULL
-    },
-    {
-        "local_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char2*", "localchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar2*", "localuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short2*", "localshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort2*", "localushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int2*", "localint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint2*", "localuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long2*", "locallong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong2*", "localulong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float2*", "localfloat2p",
-        NULL
-    },
-    {
-        "local_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "localchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "localuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "localshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "localushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "localint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "localuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "locallong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "localulong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "localfloat2restrictp",
-        NULL
-    },
-    {
-        "local_const_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char2*", "localconstchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar2*", "localconstuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short2*", "localconstshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort2*", "localconstushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int2*", "localconstint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint2*", "localconstuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long2*", "localconstlong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong2*", "localconstulong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float2*", "localconstfloat2p",
-        NULL
-    },
-    {
-        "local_const_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "localconstchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "localconstuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "localconstshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "localconstushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "localconstint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "localconstuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "localconstlong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "localconstulong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "localconstfloat2restrictp",
-        NULL
-    },
-    {
-        "local_volatile_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char2*", "localvolatilechar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar2*", "localvolatileuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short2*", "localvolatileshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort2*", "localvolatileushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int2*", "localvolatileint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint2*", "localvolatileuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long2*", "localvolatilelong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong2*", "localvolatileulong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float2*", "localvolatilefloat2p",
-        NULL
-    },
-    {
-        "local_volatile_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "localvolatilechar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "localvolatileuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "localvolatileshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "localvolatileushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "localvolatileint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "localvolatileuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "localvolatilelong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "localvolatileulong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "localvolatilefloat2restrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char2*", "localconstvolatilechar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar2*", "localconstvolatileuchar2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short2*", "localconstvolatileshort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort2*", "localconstvolatileushort2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int2*", "localconstvolatileint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint2*", "localconstvolatileuint2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long2*", "localconstvolatilelong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong2*", "localconstvolatileulong2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float2*", "localconstvolatilefloat2p",
-        NULL
-    },
-    {
-        "local_const_volatile_vector2_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char2*", "localconstvolatilechar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar2*", "localconstvolatileuchar2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short2*", "localconstvolatileshort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort2*", "localconstvolatileushort2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int2*", "localconstvolatileint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint2*", "localconstvolatileuint2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long2*", "localconstvolatilelong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong2*", "localconstvolatileulong2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float2*", "localconstvolatilefloat2restrictp",
-        NULL
-    },
-    {
-        "vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char2", "char2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar2", "uchar2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short2", "short2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort2", "ushort2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int2", "int2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint2", "uint2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long2", "long2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong2", "ulong2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float2", "float2d",
-        NULL
-    },
-    {
-        "const_vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char2", "constchar2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar2", "constuchar2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short2", "constshort2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort2", "constushort2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int2", "constint2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint2", "constuint2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long2", "constlong2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong2", "constulong2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float2", "constfloat2d",
-        NULL
-    },
-    {
-        "private_vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char2", "privatechar2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar2", "privateuchar2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short2", "privateshort2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort2", "privateushort2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int2", "privateint2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint2", "privateuint2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long2", "privatelong2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong2", "privateulong2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float2", "privatefloat2d",
-        NULL
-    },
-    {
-        "private_const_vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char2", "privateconstchar2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar2", "privateconstuchar2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short2", "privateconstshort2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort2", "privateconstushort2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int2", "privateconstint2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint2", "privateconstuint2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long2", "privateconstlong2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong2", "privateconstulong2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float2", "privateconstfloat2d",
-        NULL
-    },
-    {
-        "constant_vector3_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char3*", "constantchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar3*", "constantuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short3*", "constantshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort3*", "constantushort3p",
-        NULL
-    },
-    {
-        "constant_vector3_p1",
-    (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int3*", "constantint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint3*", "constantuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long3*", "constantlong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong3*", "constantulong3p",
-    NULL
-    },
-    {
-        "constant_vector3_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float3*", "constantfloat3p",
-        NULL
-    },
-    {
-        "constant_vector3_restrict_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "constantchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "constantuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "constantshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "constantushort3restrictp",
-        NULL
-    },
-    {
-        "constant_vector3_restrict_p1",
-    (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "constantint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "constantuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "constantlong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "constantulong3restrictp",
-    NULL
-    },
-    {
-        "constant_vector3_restrict_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "constantfloat3restrictp",
-        NULL
-    },
-    {
-        "global_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char3*", "globalchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar3*", "globaluchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short3*", "globalshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort3*", "globalushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int3*", "globalint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint3*", "globaluint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long3*", "globallong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong3*", "globalulong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float3*", "globalfloat3p",
-        NULL
-    },
-    {
-        "global_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "globalchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "globaluchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "globalshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "globalushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "globalint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "globaluint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "globallong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "globalulong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "globalfloat3restrictp",
-        NULL
-    },
-    {
-        "global_const_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char3*", "globalconstchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar3*", "globalconstuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short3*", "globalconstshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort3*", "globalconstushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int3*", "globalconstint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint3*", "globalconstuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long3*", "globalconstlong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong3*", "globalconstulong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float3*", "globalconstfloat3p",
-        NULL
-    },
-    {
-        "global_const_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "globalconstchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "globalconstuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "globalconstshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "globalconstushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "globalconstint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "globalconstuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "globalconstlong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "globalconstulong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "globalconstfloat3restrictp",
-        NULL
-    },
-    {
-        "global_volatile_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char3*", "globalvolatilechar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar3*", "globalvolatileuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short3*", "globalvolatileshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort3*", "globalvolatileushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int3*", "globalvolatileint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint3*", "globalvolatileuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long3*", "globalvolatilelong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong3*", "globalvolatileulong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float3*", "globalvolatilefloat3p",
-        NULL
-    },
-    {
-        "global_volatile_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "globalvolatilechar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "globalvolatileuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "globalvolatileshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "globalvolatileushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "globalvolatileint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "globalvolatileuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "globalvolatilelong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "globalvolatileulong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "globalvolatilefloat3restrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char3*", "globalconstvolatilechar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar3*", "globalconstvolatileuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short3*", "globalconstvolatileshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort3*", "globalconstvolatileushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int3*", "globalconstvolatileint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint3*", "globalconstvolatileuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long3*", "globalconstvolatilelong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong3*", "globalconstvolatileulong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float3*", "globalconstvolatilefloat3p",
-        NULL
-    },
-    {
-        "global_const_volatile_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "globalconstvolatilechar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "globalconstvolatileuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "globalconstvolatileshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "globalconstvolatileushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "globalconstvolatileint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "globalconstvolatileuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "globalconstvolatilelong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "globalconstvolatileulong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "globalconstvolatilefloat3restrictp",
-        NULL
-    },
-    {
-        "local_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char3*", "localchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar3*", "localuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short3*", "localshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort3*", "localushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int3*", "localint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint3*", "localuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long3*", "locallong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong3*", "localulong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float3*", "localfloat3p",
-        NULL
-    },
-    {
-        "local_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "localchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "localuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "localshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "localushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "localint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "localuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "locallong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "localulong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "localfloat3restrictp",
-        NULL
-    },
-    {
-        "local_const_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char3*", "localconstchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar3*", "localconstuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short3*", "localconstshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort3*", "localconstushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int3*", "localconstint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint3*", "localconstuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long3*", "localconstlong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong3*", "localconstulong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float3*", "localconstfloat3p",
-        NULL
-    },
-    {
-        "local_const_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "localconstchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "localconstuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "localconstshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "localconstushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "localconstint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "localconstuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "localconstlong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "localconstulong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "localconstfloat3restrictp",
-        NULL
-    },
-    {
-        "local_volatile_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char3*", "localvolatilechar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar3*", "localvolatileuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short3*", "localvolatileshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort3*", "localvolatileushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int3*", "localvolatileint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint3*", "localvolatileuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long3*", "localvolatilelong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong3*", "localvolatileulong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float3*", "localvolatilefloat3p",
-        NULL
-    },
-    {
-        "local_volatile_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "localvolatilechar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "localvolatileuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "localvolatileshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "localvolatileushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "localvolatileint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "localvolatileuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "localvolatilelong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "localvolatileulong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "localvolatilefloat3restrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char3*", "localconstvolatilechar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar3*", "localconstvolatileuchar3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short3*", "localconstvolatileshort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort3*", "localconstvolatileushort3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int3*", "localconstvolatileint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint3*", "localconstvolatileuint3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long3*", "localconstvolatilelong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong3*", "localconstvolatileulong3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float3*", "localconstvolatilefloat3p",
-        NULL
-    },
-    {
-        "local_const_volatile_vector3_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char3*", "localconstvolatilechar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar3*", "localconstvolatileuchar3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short3*", "localconstvolatileshort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort3*", "localconstvolatileushort3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int3*", "localconstvolatileint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint3*", "localconstvolatileuint3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long3*", "localconstvolatilelong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong3*", "localconstvolatileulong3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float3*", "localconstvolatilefloat3restrictp",
-        NULL
-    },
-    {
-        "vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char3", "char3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar3", "uchar3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short3", "short3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort3", "ushort3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int3", "int3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint3", "uint3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long3", "long3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong3", "ulong3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float3", "float3d",
-        NULL
-    },
-    {
-        "const_vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char3", "constchar3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar3", "constuchar3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short3", "constshort3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort3", "constushort3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int3", "constint3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint3", "constuint3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long3", "constlong3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong3", "constulong3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float3", "constfloat3d",
-        NULL
-    },
-    {
-        "private_vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char3", "privatechar3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar3", "privateuchar3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short3", "privateshort3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort3", "privateushort3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int3", "privateint3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint3", "privateuint3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long3", "privatelong3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong3", "privateulong3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float3", "privatefloat3d",
-        NULL
-    },
-    {
-        "private_const_vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char3", "privateconstchar3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar3", "privateconstuchar3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short3", "privateconstshort3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort3", "privateconstushort3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int3", "privateconstint3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint3", "privateconstuint3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long3", "privateconstlong3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong3", "privateconstulong3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float3", "privateconstfloat3d",
-        NULL
-    },
-    {
-        "constant_vector4_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char4*", "constantchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar4*", "constantuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short4*", "constantshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort4*", "constantushort4p",
-        NULL
-    },
-    {
-        "constant_vector4_p1",
-    (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int4*", "constantint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint4*", "constantuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long4*", "constantlong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong4*", "constantulong4p",
-        NULL
-    },
-    {
-        "constant_vector4_p2",
-    (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float4*", "constantfloat4p",
-        NULL
-    },
-    {
-        "constant_vector4_restrict_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "constantchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "constantuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "constantshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "constantushort4restrictp",
-        NULL
-    },
-    {
-        "constant_vector4_restrict_p1",
-    (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "constantint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "constantuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "constantlong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "constantulong4restrictp",
-        NULL
-    },
-    {
-        "constant_vector4_restrict_p2",
-    (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "constantfloat4restrictp",
-        NULL
-    },
-    {
-        "global_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char4*", "globalchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar4*", "globaluchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short4*", "globalshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort4*", "globalushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int4*", "globalint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint4*", "globaluint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long4*", "globallong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong4*", "globalulong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float4*", "globalfloat4p",
-        NULL
-    },
-    {
-        "global_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "globalchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "globaluchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "globalshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "globalushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "globalint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "globaluint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "globallong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "globalulong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "globalfloat4restrictp",
-        NULL
-    },
-    {
-        "global_const_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char4*", "globalconstchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar4*", "globalconstuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short4*", "globalconstshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort4*", "globalconstushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int4*", "globalconstint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint4*", "globalconstuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long4*", "globalconstlong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong4*", "globalconstulong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float4*", "globalconstfloat4p",
-        NULL
-    },
-    {
-        "global_const_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "globalconstchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "globalconstuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "globalconstshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "globalconstushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "globalconstint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "globalconstuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "globalconstlong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "globalconstulong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "globalconstfloat4restrictp",
-        NULL
-    },
-    {
-        "global_volatile_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char4*", "globalvolatilechar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar4*", "globalvolatileuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short4*", "globalvolatileshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort4*", "globalvolatileushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int4*", "globalvolatileint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint4*", "globalvolatileuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long4*", "globalvolatilelong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong4*", "globalvolatileulong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float4*", "globalvolatilefloat4p",
-        NULL
-    },
-    {
-        "global_volatile_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "globalvolatilechar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "globalvolatileuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "globalvolatileshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "globalvolatileushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "globalvolatileint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "globalvolatileuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "globalvolatilelong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "globalvolatileulong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "globalvolatilefloat4restrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char4*", "globalconstvolatilechar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar4*", "globalconstvolatileuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short4*", "globalconstvolatileshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort4*", "globalconstvolatileushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int4*", "globalconstvolatileint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint4*", "globalconstvolatileuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long4*", "globalconstvolatilelong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong4*", "globalconstvolatileulong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float4*", "globalconstvolatilefloat4p",
-        NULL
-    },
-    {
-        "global_const_volatile_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "globalconstvolatilechar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "globalconstvolatileuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "globalconstvolatileshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "globalconstvolatileushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "globalconstvolatileint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "globalconstvolatileuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "globalconstvolatilelong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "globalconstvolatileulong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "globalconstvolatilefloat4restrictp",
-        NULL
-    },
-    {
-        "local_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char4*", "localchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar4*", "localuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short4*", "localshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort4*", "localushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int4*", "localint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint4*", "localuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long4*", "locallong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong4*", "localulong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float4*", "localfloat4p",
-        NULL
-    },
-    {
-        "local_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "localchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "localuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "localshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "localushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "localint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "localuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "locallong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "localulong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "localfloat4restrictp",
-        NULL
-    },
-    {
-        "local_const_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char4*", "localconstchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar4*", "localconstuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short4*", "localconstshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort4*", "localconstushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int4*", "localconstint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint4*", "localconstuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long4*", "localconstlong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong4*", "localconstulong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float4*", "localconstfloat4p",
-        NULL
-    },
-    {
-        "local_const_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "localconstchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "localconstuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "localconstshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "localconstushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "localconstint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "localconstuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "localconstlong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "localconstulong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "localconstfloat4restrictp",
-        NULL
-    },
-    {
-        "local_volatile_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char4*", "localvolatilechar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar4*", "localvolatileuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short4*", "localvolatileshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort4*", "localvolatileushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int4*", "localvolatileint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint4*", "localvolatileuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long4*", "localvolatilelong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong4*", "localvolatileulong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float4*", "localvolatilefloat4p",
-        NULL
-    },
-    {
-        "local_volatile_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "localvolatilechar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "localvolatileuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "localvolatileshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "localvolatileushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "localvolatileint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "localvolatileuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "localvolatilelong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "localvolatileulong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "localvolatilefloat4restrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char4*", "localconstvolatilechar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar4*", "localconstvolatileuchar4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short4*", "localconstvolatileshort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort4*", "localconstvolatileushort4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int4*", "localconstvolatileint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint4*", "localconstvolatileuint4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long4*", "localconstvolatilelong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong4*", "localconstvolatileulong4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float4*", "localconstvolatilefloat4p",
-        NULL
-    },
-    {
-        "local_const_volatile_vector4_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char4*", "localconstvolatilechar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar4*", "localconstvolatileuchar4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short4*", "localconstvolatileshort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort4*", "localconstvolatileushort4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int4*", "localconstvolatileint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint4*", "localconstvolatileuint4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long4*", "localconstvolatilelong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong4*", "localconstvolatileulong4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float4*", "localconstvolatilefloat4restrictp",
-        NULL
-    },
-    {
-        "vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char4", "char4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar4", "uchar4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short4", "short4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort4", "ushort4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int4", "int4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint4", "uint4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long4", "long4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong4", "ulong4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float4", "float4d",
-        NULL
-    },
-    {
-        "const_vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char4", "constchar4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar4", "constuchar4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short4", "constshort4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort4", "constushort4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int4", "constint4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint4", "constuint4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long4", "constlong4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong4", "constulong4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float4", "constfloat4d",
-        NULL
-    },
-    {
-        "private_vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char4", "privatechar4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar4", "privateuchar4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short4", "privateshort4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort4", "privateushort4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int4", "privateint4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint4", "privateuint4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long4", "privatelong4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong4", "privateulong4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float4", "privatefloat4d",
-        NULL
-    },
-    {
-        "private_const_vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char4", "privateconstchar4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar4", "privateconstuchar4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short4", "privateconstshort4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort4", "privateconstushort4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int4", "privateconstint4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint4", "privateconstuint4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long4", "privateconstlong4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong4", "privateconstulong4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float4", "privateconstfloat4d",
-        NULL
-    },
-    {
-        "constant_vector8_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char8*", "constantchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar8*", "constantuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short8*", "constantshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort8*", "constantushort8p",
-        NULL
-    },
-    {
-        "constant_vector8_p1",
-    (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int8*", "constantint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint8*", "constantuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long8*", "constantlong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong8*", "constantulong8p",
-    NULL
-    },
-    {
-        "constant_vector8_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float8*", "constantfloat8p",
-        NULL
-    },
-    {
-        "constant_vector8_restrict_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "constantchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "constantuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "constantshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "constantushort8restrictp",
-        NULL
-    },
-    {
-        "constant_vector8_restrict_p1",
-    (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "constantint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "constantuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "constantlong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "constantulong8restrictp",
-    NULL
-    },
-    {
-        "constant_vector8_restrict_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "constantfloat8restrictp",
-        NULL
-    },
-    {
-        "global_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char8*", "globalchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar8*", "globaluchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short8*", "globalshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort8*", "globalushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int8*", "globalint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint8*", "globaluint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long8*", "globallong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong8*", "globalulong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float8*", "globalfloat8p",
-        NULL
-    },
-    {
-        "global_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "globalchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "globaluchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "globalshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "globalushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "globalint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "globaluint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "globallong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "globalulong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "globalfloat8restrictp",
-        NULL
-    },
-    {
-        "global_const_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char8*", "globalconstchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar8*", "globalconstuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short8*", "globalconstshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort8*", "globalconstushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int8*", "globalconstint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint8*", "globalconstuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long8*", "globalconstlong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong8*", "globalconstulong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float8*", "globalconstfloat8p",
-        NULL
-    },
-    {
-        "global_const_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "globalconstchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "globalconstuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "globalconstshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "globalconstushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "globalconstint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "globalconstuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "globalconstlong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "globalconstulong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "globalconstfloat8restrictp",
-        NULL
-    },
-    {
-        "global_volatile_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char8*", "globalvolatilechar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar8*", "globalvolatileuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short8*", "globalvolatileshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort8*", "globalvolatileushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int8*", "globalvolatileint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint8*", "globalvolatileuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long8*", "globalvolatilelong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong8*", "globalvolatileulong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float8*", "globalvolatilefloat8p",
-        NULL
-    },
-    {
-        "global_volatile_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "globalvolatilechar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "globalvolatileuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "globalvolatileshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "globalvolatileushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "globalvolatileint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "globalvolatileuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "globalvolatilelong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "globalvolatileulong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "globalvolatilefloat8restrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char8*", "globalconstvolatilechar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar8*", "globalconstvolatileuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short8*", "globalconstvolatileshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort8*", "globalconstvolatileushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int8*", "globalconstvolatileint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint8*", "globalconstvolatileuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long8*", "globalconstvolatilelong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong8*", "globalconstvolatileulong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float8*", "globalconstvolatilefloat8p",
-        NULL
-    },
-    {
-        "global_const_volatile_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "globalconstvolatilechar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "globalconstvolatileuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "globalconstvolatileshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "globalconstvolatileushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "globalconstvolatileint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "globalconstvolatileuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "globalconstvolatilelong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "globalconstvolatileulong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "globalconstvolatilefloat8restrictp",
-        NULL
-    },
-    {
-        "local_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char8*", "localchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar8*", "localuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short8*", "localshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort8*", "localushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int8*", "localint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint8*", "localuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long8*", "locallong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong8*", "localulong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float8*", "localfloat8p",
-        NULL
-    },
-    {
-        "local_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "localchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "localuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "localshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "localushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "localint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "localuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "locallong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "localulong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "localfloat8restrictp",
-        NULL
-    },
-    {
-        "local_const_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char8*", "localconstchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar8*", "localconstuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short8*", "localconstshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort8*", "localconstushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int8*", "localconstint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint8*", "localconstuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long8*", "localconstlong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong8*", "localconstulong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float8*", "localconstfloat8p",
-        NULL
-    },
-    {
-        "local_const_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "localconstchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "localconstuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "localconstshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "localconstushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "localconstint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "localconstuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "localconstlong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "localconstulong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "localconstfloat8restrictp",
-        NULL
-    },
-    {
-        "local_volatile_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char8*", "localvolatilechar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar8*", "localvolatileuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short8*", "localvolatileshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort8*", "localvolatileushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int8*", "localvolatileint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint8*", "localvolatileuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long8*", "localvolatilelong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong8*", "localvolatileulong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float8*", "localvolatilefloat8p",
-        NULL
-    },
-    {
-        "local_volatile_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "localvolatilechar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "localvolatileuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "localvolatileshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "localvolatileushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "localvolatileint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "localvolatileuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "localvolatilelong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "localvolatileulong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "localvolatilefloat8restrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char8*", "localconstvolatilechar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar8*", "localconstvolatileuchar8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short8*", "localconstvolatileshort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort8*", "localconstvolatileushort8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int8*", "localconstvolatileint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint8*", "localconstvolatileuint8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long8*", "localconstvolatilelong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong8*", "localconstvolatileulong8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float8*", "localconstvolatilefloat8p",
-        NULL
-    },
-    {
-        "local_const_volatile_vector8_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char8*", "localconstvolatilechar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar8*", "localconstvolatileuchar8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short8*", "localconstvolatileshort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort8*", "localconstvolatileushort8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int8*", "localconstvolatileint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint8*", "localconstvolatileuint8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long8*", "localconstvolatilelong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong8*", "localconstvolatileulong8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float8*", "localconstvolatilefloat8restrictp",
-        NULL
-    },
-    {
-        "vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char8", "char8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar8", "uchar8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short8", "short8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort8", "ushort8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int8", "int8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint8", "uint8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long8", "long8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong8", "ulong8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float8", "float8d",
-        NULL
-    },
-    {
-        "const_vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char8", "constchar8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar8", "constuchar8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short8", "constshort8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort8", "constushort8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int8", "constint8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint8", "constuint8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long8", "constlong8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong8", "constulong8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float8", "constfloat8d",
-        NULL
-    },
-    {
-        "private_vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char8", "privatechar8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar8", "privateuchar8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short8", "privateshort8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort8", "privateushort8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int8", "privateint8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint8", "privateuint8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long8", "privatelong8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong8", "privateulong8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float8", "privatefloat8d",
-        NULL
-    },
-    {
-        "private_const_vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char8", "privateconstchar8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar8", "privateconstuchar8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short8", "privateconstshort8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort8", "privateconstushort8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int8", "privateconstint8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint8", "privateconstuint8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long8", "privateconstlong8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong8", "privateconstulong8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float8", "privateconstfloat8d",
-        NULL
-    },
-    {
-        "constant_vector16_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char16*", "constantchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar16*", "constantuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short16*", "constantshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort16*", "constantushort16p",
-        NULL
-    },
-    {
-        "constant_vector16_p1",
-    (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int16*", "constantint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint16*", "constantuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long16*", "constantlong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong16*", "constantulong16p",
-    NULL
-    },
-    {
-        "constant_vector16_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float16*", "constantfloat16p",
-        NULL
-    },
-    {
-        "constant_vector16_restrict_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "constantchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "constantuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "constantshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "constantushort16restrictp",
-        NULL
-    },
-    {
-        "constant_vector16_restrict_p1",
-    (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "constantint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "constantuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "constantlong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "constantulong16restrictp",
-    NULL
-    },
-    {
-        "constant_vector16_restrict_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "constantfloat16restrictp",
-        NULL
-    },
-    {
-        "global_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char16*", "globalchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar16*", "globaluchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short16*", "globalshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort16*", "globalushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int16*", "globalint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint16*", "globaluint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long16*", "globallong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong16*", "globalulong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float16*", "globalfloat16p",
-        NULL
-    },
-    {
-        "global_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "globalchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "globaluchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "globalshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "globalushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "globalint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "globaluint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "globallong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "globalulong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "globalfloat16restrictp",
-        NULL
-    },
-    {
-        "global_const_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char16*", "globalconstchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar16*", "globalconstuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short16*", "globalconstshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort16*", "globalconstushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int16*", "globalconstint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint16*", "globalconstuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long16*", "globalconstlong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong16*", "globalconstulong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float16*", "globalconstfloat16p",
-        NULL
-    },
-    {
-        "global_const_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "globalconstchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "globalconstuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "globalconstshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "globalconstushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "globalconstint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "globalconstuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "globalconstlong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "globalconstulong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "globalconstfloat16restrictp",
-        NULL
-    },
-    {
-        "global_volatile_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char16*", "globalvolatilechar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar16*", "globalvolatileuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short16*", "globalvolatileshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort16*", "globalvolatileushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int16*", "globalvolatileint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint16*", "globalvolatileuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long16*", "globalvolatilelong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong16*", "globalvolatileulong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float16*", "globalvolatilefloat16p",
-        NULL
-    },
-    {
-        "global_volatile_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "globalvolatilechar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "globalvolatileuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "globalvolatileshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "globalvolatileushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "globalvolatileint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "globalvolatileuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "globalvolatilelong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "globalvolatileulong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "globalvolatilefloat16restrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char16*", "globalconstvolatilechar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar16*", "globalconstvolatileuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short16*", "globalconstvolatileshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort16*", "globalconstvolatileushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int16*", "globalconstvolatileint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint16*", "globalconstvolatileuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long16*", "globalconstvolatilelong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong16*", "globalconstvolatileulong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float16*", "globalconstvolatilefloat16p",
-        NULL
-    },
-    {
-        "global_const_volatile_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "globalconstvolatilechar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "globalconstvolatileuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "globalconstvolatileshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "globalconstvolatileushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "globalconstvolatileint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "globalconstvolatileuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "globalconstvolatilelong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "globalconstvolatileulong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "globalconstvolatilefloat16restrictp",
-        NULL
-    },
-    {
-        "local_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char16*", "localchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar16*", "localuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short16*", "localshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort16*", "localushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int16*", "localint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint16*", "localuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long16*", "locallong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong16*", "localulong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float16*", "localfloat16p",
-        NULL
-    },
-    {
-        "local_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "localchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "localuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "localshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "localushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "localint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "localuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "locallong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "localulong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "localfloat16restrictp",
-        NULL
-    },
-    {
-        "local_const_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "char16*", "localconstchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uchar16*", "localconstuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "short16*", "localconstshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ushort16*", "localconstushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "int16*", "localconstint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "uint16*", "localconstuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "long16*", "localconstlong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "ulong16*", "localconstulong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "float16*", "localconstfloat16p",
-        NULL
-    },
-    {
-        "local_const_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "localconstchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "localconstuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "localconstshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "localconstushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "localconstint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "localconstuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "localconstlong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "localconstulong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "localconstfloat16restrictp",
-        NULL
-    },
-    {
-        "local_volatile_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "char16*", "localvolatilechar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uchar16*", "localvolatileuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "short16*", "localvolatileshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ushort16*", "localvolatileushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "int16*", "localvolatileint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "uint16*", "localvolatileuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "long16*", "localvolatilelong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "ulong16*", "localvolatileulong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "float16*", "localvolatilefloat16p",
-        NULL
-    },
-    {
-        "local_volatile_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "localvolatilechar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "localvolatileuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "localvolatileshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "localvolatileushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "localvolatileint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "localvolatileuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "localvolatilelong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "localvolatileulong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "localvolatilefloat16restrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "char16*", "localconstvolatilechar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uchar16*", "localconstvolatileuchar16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "short16*", "localconstvolatileshort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ushort16*", "localconstvolatileushort16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "int16*", "localconstvolatileint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "uint16*", "localconstvolatileuint16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "long16*", "localconstvolatilelong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "ulong16*", "localconstvolatileulong16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "float16*", "localconstvolatilefloat16p",
-        NULL
-    },
-    {
-        "local_const_volatile_vector16_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "char16*", "localconstvolatilechar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uchar16*", "localconstvolatileuchar16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "short16*", "localconstvolatileshort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ushort16*", "localconstvolatileushort16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "int16*", "localconstvolatileint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "uint16*", "localconstvolatileuint16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "long16*", "localconstvolatilelong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "ulong16*", "localconstvolatileulong16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "float16*", "localconstvolatilefloat16restrictp",
-        NULL
-    },
-    {
-        "vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char16", "char16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar16", "uchar16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short16", "short16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort16", "ushort16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int16", "int16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint16", "uint16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long16", "long16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong16", "ulong16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float16", "float16d",
-        NULL
-    },
-    {
-        "const_vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char16", "constchar16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar16", "constuchar16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short16", "constshort16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort16", "constushort16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int16", "constint16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint16", "constuint16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long16", "constlong16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong16", "constulong16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float16", "constfloat16d",
-        NULL
-    },
-    {
-        "private_vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char16", "privatechar16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar16", "privateuchar16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short16", "privateshort16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort16", "privateushort16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int16", "privateint16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint16", "privateuint16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long16", "privatelong16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong16", "privateulong16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float16", "privatefloat16d",
-        NULL
-    },
-    {
-        "private_const_vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "char16", "privateconstchar16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uchar16", "privateconstuchar16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "short16", "privateconstshort16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ushort16", "privateconstushort16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "int16", "privateconstint16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "uint16", "privateconstuint16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "long16", "privateconstlong16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "ulong16", "privateconstulong16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "float16", "privateconstfloat16d",
-        NULL
-    },
-    {
-        "constant_derived_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_type*", "constanttypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "struct struct_type*", "constantstructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_struct_type*", "constanttypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "union union_type*", "constantunionunion_typep",
-        NULL
-    },
-    {
-        "constant_derived_p1",
-    (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_union_type*", "constanttypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "enum enum_type*", "constantenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_enum_type*", "constanttypedef_enum_typep",
-        NULL
-    },
-    {
-        "constant_derived_restrict_p0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "constanttypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "constantstructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "constanttypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "constantunionunion_typerestrictp",
-        NULL
-    },
-    {
-        "constant_derived_restrict_p1",
-    (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "constanttypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "constantenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "constanttypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "global_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_type*", "globaltypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "struct struct_type*", "globalstructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_struct_type*", "globaltypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "union union_type*", "globalunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_union_type*", "globaltypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "enum enum_type*", "globalenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_enum_type*", "globaltypedef_enum_typep",
-        NULL
-    },
-    {
-        "global_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "globaltypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "globalstructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "globaltypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "globalunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "globaltypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "globalenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "globaltypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "global_const_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_type*", "globalconsttypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "struct struct_type*", "globalconststructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_struct_type*", "globalconsttypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "union union_type*", "globalconstunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_union_type*", "globalconsttypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "enum enum_type*", "globalconstenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_enum_type*", "globalconsttypedef_enum_typep",
-        NULL
-    },
-    {
-        "global_const_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "globalconsttypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "globalconststructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "globalconsttypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "globalconstunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "globalconsttypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "globalconstenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "globalconsttypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "global_volatile_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_type*", "globalvolatiletypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "struct struct_type*", "globalvolatilestructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_struct_type*", "globalvolatiletypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "union union_type*", "globalvolatileunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_union_type*", "globalvolatiletypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "enum enum_type*", "globalvolatileenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_enum_type*", "globalvolatiletypedef_enum_typep",
-        NULL
-    },
-    {
-        "global_volatile_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "globalvolatiletypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "globalvolatilestructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "globalvolatiletypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "globalvolatileunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "globalvolatiletypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "globalvolatileenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "globalvolatiletypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "global_const_volatile_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_type*", "globalconstvolatiletypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "struct struct_type*", "globalconstvolatilestructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_struct_type*", "globalconstvolatiletypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "union union_type*", "globalconstvolatileunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_union_type*", "globalconstvolatiletypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "enum enum_type*", "globalconstvolatileenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_enum_type*", "globalconstvolatiletypedef_enum_typep",
-        NULL
-    },
-    {
-        "global_const_volatile_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "globalconstvolatiletypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "globalconstvolatilestructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "globalconstvolatiletypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "globalconstvolatileunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "globalconstvolatiletypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "globalconstvolatileenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "globalconstvolatiletypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "local_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_type*", "localtypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "struct struct_type*", "localstructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_struct_type*", "localtypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "union union_type*", "localunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_union_type*", "localtypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "enum enum_type*", "localenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_enum_type*", "localtypedef_enum_typep",
-        NULL
-    },
-    {
-        "local_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "localtypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "localstructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "localtypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "localunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "localtypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "localenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "localtypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "local_const_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_type*", "localconsttypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "struct struct_type*", "localconststructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_struct_type*", "localconsttypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "union union_type*", "localconstunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_union_type*", "localconsttypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "enum enum_type*", "localconstenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "typedef_enum_type*", "localconsttypedef_enum_typep",
-        NULL
-    },
-    {
-        "local_const_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "localconsttypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "localconststructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "localconsttypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "localconstunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "localconsttypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "localconstenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "localconsttypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "local_volatile_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_type*", "localvolatiletypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "struct struct_type*", "localvolatilestructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_struct_type*", "localvolatiletypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "union union_type*", "localvolatileunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_union_type*", "localvolatiletypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "enum enum_type*", "localvolatileenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_enum_type*", "localvolatiletypedef_enum_typep",
-        NULL
-    },
-    {
-        "local_volatile_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "localvolatiletypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "localvolatilestructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "localvolatiletypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "localvolatileunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "localvolatiletypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "localvolatileenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "localvolatiletypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "local_const_volatile_derived_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_type*", "localconstvolatiletypedef_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "struct struct_type*", "localconstvolatilestructstruct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_struct_type*", "localconstvolatiletypedef_struct_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "union union_type*", "localconstvolatileunionunion_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_union_type*", "localconstvolatiletypedef_union_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "enum enum_type*", "localconstvolatileenumenum_typep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "typedef_enum_type*", "localconstvolatiletypedef_enum_typep",
-        NULL
-    },
-    {
-        "local_const_volatile_derived_restrict_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_type*", "localconstvolatiletypedef_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "struct struct_type*", "localconstvolatilestructstruct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_struct_type*", "localconstvolatiletypedef_struct_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "union union_type*", "localconstvolatileunionunion_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_union_type*", "localconstvolatiletypedef_union_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "enum enum_type*", "localconstvolatileenumenum_typerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "typedef_enum_type*", "localconstvolatiletypedef_enum_typerestrictp",
-        NULL
-    },
-    {
-        "derived_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_type", "typedef_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "struct struct_type", "structstruct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_struct_type", "typedef_struct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "union union_type", "unionunion_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_union_type", "typedef_union_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "enum enum_type", "enumenum_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_enum_type", "typedef_enum_typed",
-        NULL
-    },
-    {
-        "const_derived_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_type", "consttypedef_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "struct struct_type", "conststructstruct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_struct_type", "consttypedef_struct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "union union_type", "constunionunion_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_union_type", "consttypedef_union_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "enum enum_type", "constenumenum_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_enum_type", "consttypedef_enum_typed",
-        NULL
-    },
-    {
-        "private_derived_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_type", "privatetypedef_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "struct struct_type", "privatestructstruct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_struct_type", "privatetypedef_struct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "union union_type", "privateunionunion_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_union_type", "privatetypedef_union_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "enum enum_type", "privateenumenum_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_enum_type", "privatetypedef_enum_typed",
-        NULL
-    },
-    {
-        "private_const_derived_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_type", "privateconsttypedef_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "struct struct_type", "privateconststructstruct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_struct_type", "privateconsttypedef_struct_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "union union_type", "privateconstunionunion_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_union_type", "privateconsttypedef_union_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "enum enum_type", "privateconstenumenum_typed",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "typedef_enum_type", "privateconsttypedef_enum_typed",
-        NULL
-    },
-};
-
-// Support for optional image data type
-static const char * image_kernel_args[] = {
-    "#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable\n"
-    "kernel void image_d(read_only image2d_t image2d_td0,\n"
-    "                    write_only image2d_t image2d_td1,\n"
-    "                    read_only image3d_t image3d_td2,\n"
-    "                    write_only image3d_t image3d_td3,\n"
-    "                    read_only image2d_array_t image2d_array_td4,\n"
-    "                    write_only image2d_array_t image2d_array_td5,\n"
-    "                    read_only image1d_t image1d_td6,\n"
-    "                    write_only image1d_t image1d_td7,\n"
-    "                    read_only image1d_buffer_t image1d_buffer_td8,\n"
-    "                    write_only image1d_buffer_t image1d_buffer_td9,\n"
-    "                    read_only image1d_array_t image1d_array_td10,\n"
-    "                    write_only image1d_array_t image1d_array_td11,\n"
-    "                    sampler_t sampler_td12)\n"
-    "{}\n",
-    "\n"
-};
-
-static const char * image_arg_info[][67] = {
-    {
-        "image_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_READ_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image2d_t", "image2d_td0",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_WRITE_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image2d_t", "image2d_td1",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_READ_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image3d_t", "image3d_td2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_WRITE_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image3d_t", "image3d_td3",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_READ_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image2d_array_t", "image2d_array_td4",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_WRITE_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image2d_array_t", "image2d_array_td5",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_READ_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image1d_t", "image1d_td6",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_WRITE_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image1d_t", "image1d_td7",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_READ_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image1d_buffer_t", "image1d_buffer_td8",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_WRITE_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image1d_buffer_t", "image1d_buffer_td9",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_READ_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image1d_array_t", "image1d_array_td10",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_WRITE_ONLY, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "image1d_array_t", "image1d_array_td11",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "sampler_t", "sampler_td12",
-        NULL
-    },
-};
-
-// Support for optional double data type
-static const char * double_kernel_args[] = {
-    "kernel void double_scalar_p(constant double*constantdoublep,\n"
-    "                            constant double *restrict constantdoublerestrictp,\n"
-    "                            global double*globaldoublep,\n"
-    "                            global double *restrict globaldoublerestrictp,\n"
-    "                            global const double* globalconstdoublep,\n"
-    "                            global const double * restrict globalconstdoublerestrictp,\n"
-    "                            global volatile double*globalvolatiledoublep,\n"
-    "                            global volatile double *restrict globalvolatiledoublerestrictp,\n"
-    "                            global const volatile double* globalconstvolatiledoublep)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_scalar_p2(global const volatile double * restrict globalconstvolatiledoublerestrictp,\n"
-    "                             local double*localdoublep,\n"
-    "                             local double *restrict localdoublerestrictp,\n"
-    "                             local const double* localconstdoublep,\n"
-    "                             local const double * restrict localconstdoublerestrictp,\n"
-    "                             local volatile double*localvolatiledoublep,\n"
-    "                             local volatile double *restrict localvolatiledoublerestrictp,\n"
-    "                             local const volatile double* localconstvolatiledoublep,\n"
-    "                             local const volatile double * restrict localconstvolatiledoublerestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_scalar_d(double doubled,\n"
-    "                            const double constdoubled,\n"
-    "                            private double privatedoubled,\n"
-    "                            private const double privateconstdoubled)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector2_p(constant double2*constantdouble2p,\n"
-    "                             constant double2 *restrict constantdouble2restrictp,\n"
-    "                             global double2*globaldouble2p,\n"
-    "                             global double2 *restrict globaldouble2restrictp,\n"
-    "                             global const double2* globalconstdouble2p,\n"
-    "                             global const double2 * restrict globalconstdouble2restrictp,\n"
-    "                             global volatile double2*globalvolatiledouble2p,\n"
-    "                             global volatile double2 *restrict globalvolatiledouble2restrictp,\n"
-    "                             global const volatile double2* globalconstvolatiledouble2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector2_p2(global const volatile double2 * restrict globalconstvolatiledouble2restrictp,\n"
-    "                              local double2*localdouble2p,\n"
-    "                              local double2 *restrict localdouble2restrictp,\n"
-    "                              local const double2* localconstdouble2p,\n"
-    "                              local const double2 * restrict localconstdouble2restrictp,\n"
-    "                              local volatile double2*localvolatiledouble2p,\n"
-    "                              local volatile double2 *restrict localvolatiledouble2restrictp,\n"
-    "                              local const volatile double2* localconstvolatiledouble2p,\n"
-    "                              local const volatile double2 * restrict localconstvolatiledouble2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector2_d(double2 double2d,\n"
-    "                             const double2 constdouble2d,\n"
-    "                             private double2 privatedouble2d,\n"
-    "                             private const double2 privateconstdouble2d)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector3_p(constant double3*constantdouble3p,\n"
-    "                             constant double3 *restrict constantdouble3restrictp,\n"
-    "                             global double3*globaldouble3p,\n"
-    "                             global double3 *restrict globaldouble3restrictp,\n"
-    "                             global const double3* globalconstdouble3p,\n"
-    "                             global const double3 * restrict globalconstdouble3restrictp,\n"
-    "                             global volatile double3*globalvolatiledouble3p,\n"
-    "                             global volatile double3 *restrict globalvolatiledouble3restrictp,\n"
-    "                             global const volatile double3* globalconstvolatiledouble3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector3_p2(global const volatile double3 * restrict globalconstvolatiledouble3restrictp,\n"
-    "                              local double3*localdouble3p,\n"
-    "                              local double3 *restrict localdouble3restrictp,\n"
-    "                              local const double3* localconstdouble3p,\n"
-    "                              local const double3 * restrict localconstdouble3restrictp,\n"
-    "                              local volatile double3*localvolatiledouble3p,\n"
-    "                              local volatile double3 *restrict localvolatiledouble3restrictp,\n"
-    "                              local const volatile double3* localconstvolatiledouble3p,\n"
-    "                              local const volatile double3 * restrict localconstvolatiledouble3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector3_d(double3 double3d,\n"
-    "                             const double3 constdouble3d,\n"
-    "                             private double3 privatedouble3d,\n"
-    "                             private const double3 privateconstdouble3d)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector4_p(constant double4*constantdouble4p,\n"
-    "                             constant double4 *restrict constantdouble4restrictp,\n"
-    "                             global double4*globaldouble4p,\n"
-    "                             global double4 *restrict globaldouble4restrictp,\n"
-    "                             global const double4* globalconstdouble4p,\n"
-    "                             global const double4 * restrict globalconstdouble4restrictp,\n"
-    "                             global volatile double4*globalvolatiledouble4p,\n"
-    "                             global volatile double4 *restrict globalvolatiledouble4restrictp,\n"
-    "                             global const volatile double4* globalconstvolatiledouble4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector4_p2(global const volatile double4 * restrict globalconstvolatiledouble4restrictp,\n"
-    "                              local double4*localdouble4p,\n"
-    "                              local double4 *restrict localdouble4restrictp,\n"
-    "                              local const double4* localconstdouble4p,\n"
-    "                              local const double4 * restrict localconstdouble4restrictp,\n"
-    "                              local volatile double4*localvolatiledouble4p,\n"
-    "                              local volatile double4 *restrict localvolatiledouble4restrictp,\n"
-    "                              local const volatile double4* localconstvolatiledouble4p,\n"
-    "                              local const volatile double4 * restrict localconstvolatiledouble4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector4_d(double4 double4d,\n"
-    "                             const double4 constdouble4d,\n"
-    "                             private double4 privatedouble4d,\n"
-    "                             private const double4 privateconstdouble4d)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector8_p(constant double8*constantdouble8p,\n"
-    "                             constant double8 *restrict constantdouble8restrictp,\n"
-    "                             global double8*globaldouble8p,\n"
-    "                             global double8 *restrict globaldouble8restrictp,\n"
-    "                             global const double8* globalconstdouble8p,\n"
-    "                             global const double8 * restrict globalconstdouble8restrictp,\n"
-    "                             global volatile double8*globalvolatiledouble8p,\n"
-    "                             global volatile double8 *restrict globalvolatiledouble8restrictp,\n"
-    "                             global const volatile double8* globalconstvolatiledouble8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector8_p2(global const volatile double8 * restrict globalconstvolatiledouble8restrictp,\n"
-    "                              local double8*localdouble8p,\n"
-    "                              local double8 *restrict localdouble8restrictp,\n"
-    "                              local const double8* localconstdouble8p,\n"
-    "                              local const double8 * restrict localconstdouble8restrictp,\n"
-    "                              local volatile double8*localvolatiledouble8p,\n"
-    "                              local volatile double8 *restrict localvolatiledouble8restrictp,\n"
-    "                              local const volatile double8* localconstvolatiledouble8p,\n"
-    "                              local const volatile double8 * restrict localconstvolatiledouble8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector8_d(double8 double8d,\n"
-    "                             const double8 constdouble8d,\n"
-    "                             private double8 privatedouble8d,\n"
-    "                             private const double8 privateconstdouble8d)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector16_p(constant double16*constantdouble16p,\n"
-    "                              constant double16 *restrict constantdouble16restrictp,\n"
-    "                              global double16*globaldouble16p,\n"
-    "                              global double16 *restrict globaldouble16restrictp,\n"
-    "                              global const double16* globalconstdouble16p,\n"
-    "                              global const double16 * restrict globalconstdouble16restrictp,\n"
-    "                              global volatile double16*globalvolatiledouble16p,\n"
-    "                              global volatile double16 *restrict globalvolatiledouble16restrictp,\n"
-    "                              global const volatile double16* globalconstvolatiledouble16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector16_p2(global const volatile double16 * restrict globalconstvolatiledouble16restrictp,\n"
-    "                               local double16*localdouble16p,\n"
-    "                               local double16 *restrict localdouble16restrictp,\n"
-    "                               local const double16* localconstdouble16p,\n"
-    "                               local const double16 * restrict localconstdouble16restrictp,\n"
-    "                               local volatile double16*localvolatiledouble16p,\n"
-    "                               local volatile double16 *restrict localvolatiledouble16restrictp,\n"
-    "                               local const volatile double16* localconstvolatiledouble16p,\n"
-    "                               local const volatile double16 * restrict localconstvolatiledouble16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void double_vector16_d(double16 double16d,\n"
-    "                              const double16 constdouble16d,\n"
-    "                              private double16 privatedouble16d,\n"
-    "                              private const double16 privateconstdouble16d)\n"
-    "{}\n",
-    "\n"
-};
-
-static const char * double_arg_info[][77] = {
-    {
-        "double_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double*", "constantdoublep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "constantdoublerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double*", "globaldoublep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "globaldoublerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double*", "globalconstdoublep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "globalconstdoublerestrictp",
-    (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double*", "globalvolatiledoublep",
-    (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "globalvolatiledoublerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double*", "globalconstvolatiledoublep",
-        NULL
-    },
-    {
-        "double_scalar_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "globalconstvolatiledoublerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double*", "localdoublep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "localdoublerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double*", "localconstdoublep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "localconstdoublerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double*", "localvolatiledoublep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "localvolatiledoublerestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double*", "localconstvolatiledoublep",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double*", "localconstvolatiledoublerestrictp",
-        NULL
-    },
-    {
-        "double_scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double", "doubled",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double", "constdoubled",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double", "privatedoubled",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double", "privateconstdoubled",
-        NULL
-    },
-    {
-        "double_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double2*", "constantdouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "constantdouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double2*", "globaldouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "globaldouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double2*", "globalconstdouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "globalconstdouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double2*", "globalvolatiledouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "globalvolatiledouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double2*", "globalconstvolatiledouble2p",
-        NULL
-    },
-    {
-        "double_vector2_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "globalconstvolatiledouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double2*", "localdouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "localdouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double2*", "localconstdouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "localconstdouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double2*", "localvolatiledouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "localvolatiledouble2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double2*", "localconstvolatiledouble2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double2*", "localconstvolatiledouble2restrictp",
-        NULL
-    },
-    {
-        "double_vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double2", "double2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double2", "constdouble2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double2", "privatedouble2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double2", "privateconstdouble2d",
-        NULL
-    },
-    {
-        "double_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double3*", "constantdouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "constantdouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double3*", "globaldouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "globaldouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double3*", "globalconstdouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "globalconstdouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double3*", "globalvolatiledouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "globalvolatiledouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double3*", "globalconstvolatiledouble3p",
-        NULL
-    },
-    {
-        "double_vector3_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "globalconstvolatiledouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double3*", "localdouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "localdouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double3*", "localconstdouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "localconstdouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double3*", "localvolatiledouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "localvolatiledouble3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double3*", "localconstvolatiledouble3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double3*", "localconstvolatiledouble3restrictp",
-        NULL
-    },
-    {
-        "double_vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double3", "double3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double3", "constdouble3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double3", "privatedouble3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double3", "privateconstdouble3d",
-        NULL
-    },
-    {
-        "double_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double4*", "constantdouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "constantdouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double4*", "globaldouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "globaldouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double4*", "globalconstdouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "globalconstdouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double4*", "globalvolatiledouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "globalvolatiledouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double4*", "globalconstvolatiledouble4p",
-        NULL
-    },
-    {
-        "double_vector4_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "globalconstvolatiledouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double4*", "localdouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "localdouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double4*", "localconstdouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "localconstdouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double4*", "localvolatiledouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "localvolatiledouble4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double4*", "localconstvolatiledouble4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double4*", "localconstvolatiledouble4restrictp",
-        NULL
-    },
-    {
-        "double_vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double4", "double4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double4", "constdouble4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double4", "privatedouble4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double4", "privateconstdouble4d",
-        NULL
-    },
-    {
-        "double_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double8*", "constantdouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "constantdouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double8*", "globaldouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "globaldouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double8*", "globalconstdouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "globalconstdouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double8*", "globalvolatiledouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "globalvolatiledouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double8*", "globalconstvolatiledouble8p",
-        NULL
-    },
-    {
-        "double_vector8_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "globalconstvolatiledouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double8*", "localdouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "localdouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double8*", "localconstdouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "localconstdouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double8*", "localvolatiledouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "localvolatiledouble8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double8*", "localconstvolatiledouble8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double8*", "localconstvolatiledouble8restrictp",
-        NULL
-    },
-    {
-        "double_vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double8", "double8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double8", "constdouble8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double8", "privatedouble8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double8", "privateconstdouble8d",
-        NULL
-    },
-    {
-        "double_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double16*", "constantdouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "constantdouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double16*", "globaldouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "globaldouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double16*", "globalconstdouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "globalconstdouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double16*", "globalvolatiledouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "globalvolatiledouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double16*", "globalconstvolatiledouble16p",
-        NULL
-    },
-    {
-        "double_vector16_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "globalconstvolatiledouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double16*", "localdouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "localdouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "double16*", "localconstdouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "localconstdouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "double16*", "localvolatiledouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "localvolatiledouble16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "double16*", "localconstvolatiledouble16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "double16*", "localconstvolatiledouble16restrictp",
-        NULL
-    },
-    {
-        "double_vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double16", "double16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double16", "constdouble16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double16", "privatedouble16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "double16", "privateconstdouble16d",
-        NULL
-    },
-};
-
-
-// Support for optional half data type
-static const char * half_kernel_args[] = {
-    "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
-    "\n"
-    "kernel void half_scalar_p(constant half*constanthalfp,\n"
-    "                          constant half *restrict constanthalfrestrictp,\n"
-    "                          global half*globalhalfp,\n"
-    "                          global half *restrict globalhalfrestrictp,\n"
-    "                          global const half* globalconsthalfp,\n"
-    "                          global const half * restrict globalconsthalfrestrictp,\n"
-    "                          global volatile half*globalvolatilehalfp,\n"
-    "                          global volatile half *restrict globalvolatilehalfrestrictp,\n"
-    "                          global const volatile half* globalconstvolatilehalfp)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_scalar_p2(global const volatile half * restrict globalconstvolatilehalfrestrictp,\n"
-    "                           local half*localhalfp,\n"
-    "                           local half *restrict localhalfrestrictp,\n"
-    "                           local const half* localconsthalfp,\n"
-    "                           local const half * restrict localconsthalfrestrictp,\n"
-    "                           local volatile half*localvolatilehalfp,\n"
-    "                           local volatile half *restrict localvolatilehalfrestrictp,\n"
-    "                           local const volatile half* localconstvolatilehalfp,\n"
-    "                           local const volatile half * restrict localconstvolatilehalfrestrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_scalar_d(half halfd,\n"
-    "                          const half consthalfd,\n"
-    "                          private half privatehalfd,\n"
-    "                          private const half privateconsthalfd)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector2_p(constant half2*constanthalf2p,\n"
-    "                           constant half2 *restrict constanthalf2restrictp,\n"
-    "                           global half2*globalhalf2p,\n"
-    "                           global half2 *restrict globalhalf2restrictp,\n"
-    "                           global const half2* globalconsthalf2p,\n"
-    "                           global const half2 * restrict globalconsthalf2restrictp,\n"
-    "                           global volatile half2*globalvolatilehalf2p,\n"
-    "                           global volatile half2 *restrict globalvolatilehalf2restrictp,\n"
-    "                           global const volatile half2* globalconstvolatilehalf2p)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector2_p2(global const volatile half2 * restrict globalconstvolatilehalf2restrictp,\n"
-    "                            local half2*localhalf2p,\n"
-    "                            local half2 *restrict localhalf2restrictp,\n"
-    "                            local const half2* localconsthalf2p,\n"
-    "                            local const half2 * restrict localconsthalf2restrictp,\n"
-    "                            local volatile half2*localvolatilehalf2p,\n"
-    "                            local volatile half2 *restrict localvolatilehalf2restrictp,\n"
-    "                            local const volatile half2* localconstvolatilehalf2p,\n"
-    "                            local const volatile half2 * restrict localconstvolatilehalf2restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector2_d(half2 half2d,\n"
-    "                           const half2 consthalf2d,\n"
-    "                           private half2 privatehalf2d,\n"
-    "                           private const half2 privateconsthalf2d)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector3_p(constant half3*constanthalf3p,\n"
-    "                           constant half3 *restrict constanthalf3restrictp,\n"
-    "                           global half3*globalhalf3p,\n"
-    "                           global half3 *restrict globalhalf3restrictp,\n"
-    "                           global const half3* globalconsthalf3p,\n"
-    "                           global const half3 * restrict globalconsthalf3restrictp,\n"
-    "                           global volatile half3*globalvolatilehalf3p,\n"
-    "                           global volatile half3 *restrict globalvolatilehalf3restrictp,\n"
-    "                           global const volatile half3* globalconstvolatilehalf3p)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector3_p2(global const volatile half3 * restrict globalconstvolatilehalf3restrictp,\n"
-    "                            local half3*localhalf3p,\n"
-    "                            local half3 *restrict localhalf3restrictp,\n"
-    "                            local const half3* localconsthalf3p,\n"
-    "                            local const half3 * restrict localconsthalf3restrictp,\n"
-    "                            local volatile half3*localvolatilehalf3p,\n"
-    "                            local volatile half3 *restrict localvolatilehalf3restrictp,\n"
-    "                            local const volatile half3* localconstvolatilehalf3p,\n"
-    "                            local const volatile half3 * restrict localconstvolatilehalf3restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector3_d(half3 half3d,\n"
-    "                           const half3 consthalf3d,\n"
-    "                           private half3 privatehalf3d,\n"
-    "                           private const half3 privateconsthalf3d)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector4_p(constant half4*constanthalf4p,\n"
-    "                           constant half4 *restrict constanthalf4restrictp,\n"
-    "                           global half4*globalhalf4p,\n"
-    "                           global half4 *restrict globalhalf4restrictp,\n"
-    "                           global const half4* globalconsthalf4p,\n"
-    "                           global const half4 * restrict globalconsthalf4restrictp,\n"
-    "                           global volatile half4*globalvolatilehalf4p,\n"
-    "                           global volatile half4 *restrict globalvolatilehalf4restrictp,\n"
-    "                           global const volatile half4* globalconstvolatilehalf4p)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector4_p2(global const volatile half4 * restrict globalconstvolatilehalf4restrictp,\n"
-    "                            local half4*localhalf4p,\n"
-    "                            local half4 *restrict localhalf4restrictp,\n"
-    "                            local const half4* localconsthalf4p,\n"
-    "                            local const half4 * restrict localconsthalf4restrictp,\n"
-    "                            local volatile half4*localvolatilehalf4p,\n"
-    "                            local volatile half4 *restrict localvolatilehalf4restrictp,\n"
-    "                            local const volatile half4* localconstvolatilehalf4p,\n"
-    "                            local const volatile half4 * restrict localconstvolatilehalf4restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector4_d(half4 half4d,\n"
-    "                           const half4 consthalf4d,\n"
-    "                           private half4 privatehalf4d,\n"
-    "                           private const half4 privateconsthalf4d)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector8_p(constant half8*constanthalf8p,\n"
-    "                           constant half8 *restrict constanthalf8restrictp,\n"
-    "                           global half8*globalhalf8p,\n"
-    "                           global half8 *restrict globalhalf8restrictp,\n"
-    "                           global const half8* globalconsthalf8p,\n"
-    "                           global const half8 * restrict globalconsthalf8restrictp,\n"
-    "                           global volatile half8*globalvolatilehalf8p,\n"
-    "                           global volatile half8 *restrict globalvolatilehalf8restrictp,\n"
-    "                           global const volatile half8* globalconstvolatilehalf8p)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector8_p2(global const volatile half8 * restrict globalconstvolatilehalf8restrictp,\n"
-    "                            local half8*localhalf8p,\n"
-    "                            local half8 *restrict localhalf8restrictp,\n"
-    "                            local const half8* localconsthalf8p,\n"
-    "                            local const half8 * restrict localconsthalf8restrictp,\n"
-    "                            local volatile half8*localvolatilehalf8p,\n"
-    "                            local volatile half8 *restrict localvolatilehalf8restrictp,\n"
-    "                            local const volatile half8* localconstvolatilehalf8p,\n"
-    "                            local const volatile half8 * restrict localconstvolatilehalf8restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector8_d(half8 half8d,\n"
-    "                           const half8 consthalf8d,\n"
-    "                           private half8 privatehalf8d,\n"
-    "                           private const half8 privateconsthalf8d)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector16_p(constant half16*constanthalf16p,\n"
-    "                            constant half16 *restrict constanthalf16restrictp,\n"
-    "                            global half16*globalhalf16p,\n"
-    "                            global half16 *restrict globalhalf16restrictp,\n"
-    "                            global const half16* globalconsthalf16p,\n"
-    "                            global const half16 * restrict globalconsthalf16restrictp,\n"
-    "                            global volatile half16*globalvolatilehalf16p,\n"
-    "                            global volatile half16 *restrict globalvolatilehalf16restrictp,\n"
-    "                            global const volatile half16* globalconstvolatilehalf16p)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector16_p2(global const volatile half16 * restrict globalconstvolatilehalf16restrictp,\n"
-    "                             local half16*localhalf16p,\n"
-    "                             local half16 *restrict localhalf16restrictp,\n"
-    "                             local const half16* localconsthalf16p,\n"
-    "                             local const half16 * restrict localconsthalf16restrictp,\n"
-    "                             local volatile half16*localvolatilehalf16p,\n"
-    "                             local volatile half16 *restrict localvolatilehalf16restrictp,\n"
-    "                             local const volatile half16* localconstvolatilehalf16p,\n"
-    "                             local const volatile half16 * restrict localconstvolatilehalf16restrictp)\n"
-    "{}\n",
-    "\n"
-    "kernel void half_vector16_d(half16 half16d,\n"
-    "                            const half16 consthalf16d,\n"
-    "                            private half16 privatehalf16d,\n"
-    "                            private const half16 privateconsthalf16d)\n"
-    "{}\n",
-    "\n"
-};
-
-static const char * half_arg_info[][77] = {
-    {
-        "half_scalar_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half*", "constanthalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "constanthalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half*", "globalhalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "globalhalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half*", "globalconsthalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "globalconsthalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half*", "globalvolatilehalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "globalvolatilehalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half*", "globalconstvolatilehalfp",
-        NULL
-    },
-    {
-        "half_scalar_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "globalconstvolatilehalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half*", "localhalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "localhalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half*", "localconsthalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "localconsthalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half*", "localvolatilehalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "localvolatilehalfrestrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half*", "localconstvolatilehalfp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half*", "localconstvolatilehalfrestrictp",
-        NULL
-    },
-    {
-        "half_scalar_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half", "halfd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half", "consthalfd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half", "privatehalfd",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half", "privateconsthalfd",
-        NULL
-    },
-    {
-        "half_vector2_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half2*", "constanthalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "constanthalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half2*", "globalhalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "globalhalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half2*", "globalconsthalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "globalconsthalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half2*", "globalvolatilehalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "globalvolatilehalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half2*", "globalconstvolatilehalf2p",
-        NULL
-    },
-    {
-        "half_vector2_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "globalconstvolatilehalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half2*", "localhalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "localhalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half2*", "localconsthalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "localconsthalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half2*", "localvolatilehalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "localvolatilehalf2restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half2*", "localconstvolatilehalf2p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half2*", "localconstvolatilehalf2restrictp",
-        NULL
-    },
-    {
-        "half_vector2_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half2", "half2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half2", "consthalf2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half2", "privatehalf2d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half2", "privateconsthalf2d",
-        NULL
-    },
-    {
-        "half_vector3_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half3*", "constanthalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "constanthalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half3*", "globalhalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "globalhalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half3*", "globalconsthalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "globalconsthalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half3*", "globalvolatilehalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "globalvolatilehalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half3*", "globalconstvolatilehalf3p",
-        NULL
-    },
-    {
-        "half_vector3_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "globalconstvolatilehalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half3*", "localhalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "localhalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half3*", "localconsthalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "localconsthalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half3*", "localvolatilehalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "localvolatilehalf3restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half3*", "localconstvolatilehalf3p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half3*", "localconstvolatilehalf3restrictp",
-        NULL
-    },
-    {
-        "half_vector3_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half3", "half3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half3", "consthalf3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half3", "privatehalf3d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half3", "privateconsthalf3d",
-        NULL
-    },
-    {
-        "half_vector4_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half4*", "constanthalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "constanthalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half4*", "globalhalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "globalhalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half4*", "globalconsthalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "globalconsthalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half4*", "globalvolatilehalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "globalvolatilehalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half4*", "globalconstvolatilehalf4p",
-        NULL
-    },
-    {
-        "half_vector4_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "globalconstvolatilehalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half4*", "localhalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "localhalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half4*", "localconsthalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "localconsthalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half4*", "localvolatilehalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "localvolatilehalf4restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half4*", "localconstvolatilehalf4p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half4*", "localconstvolatilehalf4restrictp",
-        NULL
-    },
-    {
-        "half_vector4_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half4", "half4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half4", "consthalf4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half4", "privatehalf4d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half4", "privateconsthalf4d",
-        NULL
-    },
-    {
-        "half_vector8_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half8*", "constanthalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "constanthalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half8*", "globalhalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "globalhalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half8*", "globalconsthalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "globalconsthalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half8*", "globalvolatilehalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "globalvolatilehalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half8*", "globalconstvolatilehalf8p",
-        NULL
-    },
-    {
-        "half_vector8_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "globalconstvolatilehalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half8*", "localhalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "localhalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half8*", "localconsthalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "localconsthalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half8*", "localvolatilehalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "localvolatilehalf8restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half8*", "localconstvolatilehalf8p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half8*", "localconstvolatilehalf8restrictp",
-        NULL
-    },
-    {
-        "half_vector8_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half8", "half8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half8", "consthalf8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half8", "privatehalf8d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half8", "privateconsthalf8d",
-        NULL
-    },
-    {
-        "half_vector16_p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half16*", "constanthalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_CONSTANT, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "constanthalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half16*", "globalhalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "globalhalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half16*", "globalconsthalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "globalconsthalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half16*", "globalvolatilehalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "globalvolatilehalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half16*", "globalconstvolatilehalf16p",
-        NULL
-    },
-    {
-        "half_vector16_p2",
-        (const char *)CL_KERNEL_ARG_ADDRESS_GLOBAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "globalconstvolatilehalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half16*", "localhalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "localhalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST), "half16*", "localconsthalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "localconsthalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE), "half16*", "localvolatilehalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "localvolatilehalf16restrictp",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE), "half16*", "localconstvolatilehalf16p",
-        (const char *)CL_KERNEL_ARG_ADDRESS_LOCAL, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE|CL_KERNEL_ARG_TYPE_RESTRICT), "half16*", "localconstvolatilehalf16restrictp",
-        NULL
-    },
-    {
-        "half_vector16_d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half16", "half16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half16", "consthalf16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half16", "privatehalf16d",
-        (const char *)CL_KERNEL_ARG_ADDRESS_PRIVATE, (const char *)CL_KERNEL_ARG_ACCESS_NONE, (const char *)(CL_KERNEL_ARG_TYPE_NONE), "half16", "privateconsthalf16d",
-        NULL
-    },
-};
-
-
-template<typename arg_info_t>
-int test(cl_device_id deviceID, cl_context context, kernel_args_t kernel_args, cl_uint lines_count, arg_info_t arg_info, size_t total_kernels_in_program) {
-
-    const size_t max_name_len = 512;
-    cl_char name[ max_name_len ];
-    cl_uint arg_count, numArgs;
-    size_t i, j, size;
-    int error;
-
-    clProgramWrapper program =
-    clCreateProgramWithSource(context, lines_count, kernel_args, NULL, &error);
-    if ( program == NULL || error != CL_SUCCESS )
-    {
-        print_error( error, "Unable to create required arguments kernel program" );
-        return -1;
-    }
-
-    // Compile the program
-    log_info( "Building kernels...\n" );
-    clBuildProgram( program, 1, &deviceID, "-cl-kernel-arg-info", NULL, NULL );
-
-    // check for build errors and exit if things didn't work
-    size_t size_ret;
-    cl_build_status build_status;
-    error = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_STATUS, sizeof(build_status), &build_status, &size_ret);
-    test_error( error, "Unable to query build status" );
-    if (build_status == CL_BUILD_ERROR) {
-        printf("CL_PROGRAM_BUILD_STATUS=%d\n", (int) build_status);
-        error = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &size_ret);
-        test_error( error, "Unable to get build log size" );
-        char *build_log = (char *)malloc(size_ret);
-        error = clGetProgramBuildInfo(program,deviceID, CL_PROGRAM_BUILD_LOG, size_ret, build_log, &size_ret);
-        test_error( error, "Unable to get build log" );
-        printf("CL_PROGRAM_BUILD_LOG:\n%s\n", build_log);
-        printf("CL_BUILD_ERROR. exiting\n");
-        free(build_log);
-        return -1;
-    }
-
-    // Lookup the number of kernels in the program.
-    log_info( "Testing kernels...\n" );
-    size_t total_kernels = 0;
-    error = clGetProgramInfo( program, CL_PROGRAM_NUM_KERNELS, sizeof( size_t ), &total_kernels, NULL );
-    test_error( error, "Unable to get program info num kernels" );
-
-    if ( total_kernels != total_kernels_in_program )
-    {
-        print_error( error, "Program did not build all kernels" );
-        return -1;
-    }
-
-    // Lookup the kernel names.
-    size_t kernel_names_len = 0;
-    error = clGetProgramInfo( program, CL_PROGRAM_KERNEL_NAMES, 0, NULL, &kernel_names_len );
-    test_error( error, "Unable to get length of kernel names list." );
-
-    size_t expected_kernel_names_len = 0;
-    for ( i = 0; i < total_kernels; ++i )
-    {
-        expected_kernel_names_len += 1 + strlen( arg_info[ i ][ 0 ] );
-    }
-    if ( kernel_names_len != expected_kernel_names_len )
-    {
-        log_error( "Kernel names string is not the right length, expected %d, got %d\n", (int) expected_kernel_names_len, (int) kernel_names_len );
-        return -1;
-    }
-
-    const size_t len = ( kernel_names_len + 1 ) * sizeof( char );
-    char* kernel_names = (char*) malloc( len );
-    error = clGetProgramInfo( program, CL_PROGRAM_KERNEL_NAMES, len, kernel_names, &kernel_names_len );
-    test_error( error, "Unable to get kernel names list." );
-
-    // Check to see if the kernel name array is null terminated.
-    if ( kernel_names[ kernel_names_len - 1 ] != '\0' )
-    {
-        free( kernel_names );
-        print_error( error, "Kernel name list was not null terminated" );
-        return -1;
-    }
-
-    // Check to see if the correct kernel name string was returned.
-    // Does the string contain each expected kernel name?
-    for ( i = 0; i < total_kernels; ++i )
-        if ( !strstr( kernel_names, arg_info[ i ][ 0 ] ) )
-            break;
-    if ( i != total_kernels )
-    {
-        log_error( "Kernel names string is missing \"%s\"\n", arg_info[ i ][ 0 ] );
-        free( kernel_names );
-        return -1;
-    }
-
-    // Are the kernel names delimited by ';'?
-    if ( !strtok( kernel_names, ";" ) )
-    {
-        error = -1;
-    }
-    else
-    {
-        for ( i = 1; i < total_kernels; ++i )
-        {
-            if ( !strtok( NULL, ";" ) )
-            {
-                error = -1;
-            }
-        }
-    }
-    if ( error )
-    {
-        log_error( "Kernel names string was not properly delimited by ';'\n" );
-        free( kernel_names );
-        return -1;
-    }
-    free( kernel_names );
-
-    // Create kernel objects and query them.
-    int rc = 0;
-    for ( i = 0; i < total_kernels; ++i )
-    {
-        int kernel_rc = 0;
-        const char* kernel_name = arg_info[ i ][ 0 ];
-        clKernelWrapper kernel = clCreateKernel(program, kernel_name, &error);
-        if( kernel == NULL || error != CL_SUCCESS )
-        {
-            log_error( "ERROR: Could not get kernel: %s\n", kernel_name );
-            kernel_rc = -1;
-        }
-
-        if(kernel_rc == 0)
-        {
-            // Determine the expected number of arguments.
-            arg_count = 0;
-            while (arg_info[ i ][ (ARG_INFO_FIELD_COUNT * arg_count) + 1 ] != NULL)
-                ++arg_count;
-
-            // Try to get the number of arguments.
-            error = clGetKernelInfo( kernel, CL_KERNEL_NUM_ARGS, 0, NULL, &size );
-            test_error( error, "Unable to get kernel arg count param size" );
-            if( size != sizeof( numArgs ) )
-            {
-                log_error( "ERROR: Kernel arg count param returns invalid size (expected %d, got %d) for kernel: %s\n", (int)sizeof( numArgs ), (int)size, kernel_name );
-                kernel_rc = -1;
-            }
-        }
-
-
-        if(kernel_rc == 0)
-        {
-            error = clGetKernelInfo( kernel, CL_KERNEL_NUM_ARGS, sizeof( numArgs ), &numArgs, NULL );
-            test_error( error, "Unable to get kernel arg count" );
-            if( numArgs != arg_count )
-            {
-                log_error( "ERROR: Kernel arg count returned invalid value (expected %d, got %d) for kernel: %s\n", arg_count, numArgs, kernel_name );
-                kernel_rc = -1;
-            }
-        }
-
-        if(kernel_rc == 0)
-        {
-            for ( j = 0; j < numArgs; ++j )
-            {
-
-                int arg_rc = 0;
-                cl_kernel_arg_address_qualifier expected_address_qualifier = (cl_kernel_arg_address_qualifier)(uintptr_t)arg_info[ i ][ (ARG_INFO_FIELD_COUNT * j) + ARG_INFO_ADDR_OFFSET ];
-                cl_kernel_arg_access_qualifier expected_access_qualifier =  (cl_kernel_arg_access_qualifier)(uintptr_t)arg_info[ i ][ (ARG_INFO_FIELD_COUNT * j) + ARG_INFO_ACCESS_OFFSET ];
-                cl_kernel_arg_type_qualifier expected_type_qualifier = (cl_kernel_arg_type_qualifier)(uintptr_t)arg_info[ i ][ (ARG_INFO_FIELD_COUNT * j) + ARG_INFO_TYPE_QUAL_OFFSET ];
-                const char* expected_type_name = arg_info[ i ][ (ARG_INFO_FIELD_COUNT * j) + ARG_INFO_TYPE_NAME_OFFSET ];
-                const char* expected_arg_name = arg_info[ i ][ (ARG_INFO_FIELD_COUNT * j) + ARG_INFO_ARG_NAME_OFFSET ];
-
-                // Try to get the address qualifier of each argument.
-                cl_kernel_arg_address_qualifier address_qualifier = 0;
-                error = clGetKernelArgInfo( kernel, (cl_uint)j, CL_KERNEL_ARG_ADDRESS_QUALIFIER, sizeof address_qualifier, &address_qualifier, &size );
-                test_error( error, "Unable to get argument address qualifier" );
-                error = (address_qualifier != expected_address_qualifier);
-                if ( error )
-                {
-                    log_error( "ERROR: Bad address qualifier, kernel: \"%s\", argument number: %d, expected \"0x%X\", got \"0x%X\"\n", kernel_name, (unsigned int)j, (unsigned int)expected_address_qualifier, (unsigned int)address_qualifier );
-                    arg_rc = -1;
-                }
-
-                // Try to get the access qualifier of each argument.
-                cl_kernel_arg_access_qualifier access_qualifier = 0;
-                error = clGetKernelArgInfo( kernel, (cl_uint)j, CL_KERNEL_ARG_ACCESS_QUALIFIER, sizeof access_qualifier, &access_qualifier, &size );
-                test_error( error, "Unable to get argument access qualifier" );
-                error = (access_qualifier != expected_access_qualifier);
-                if ( error )
-                {
-                    log_error( "ERROR: Bad access qualifier, kernel: \"%s\", argument number: %d, expected \"0x%X\", got \"0x%X\"\n", kernel_name, (unsigned int)j, (unsigned int)expected_access_qualifier, (unsigned int)access_qualifier );
-                    arg_rc = -1;
-                }
-
-                // Try to get the type qualifier of each argument.
-                cl_kernel_arg_type_qualifier arg_type_qualifier = 0;
-                error = clGetKernelArgInfo( kernel, (cl_uint)j, CL_KERNEL_ARG_TYPE_QUALIFIER, sizeof arg_type_qualifier, &arg_type_qualifier, &size );
-                test_error( error, "Unable to get argument type qualifier" );
-                error = (arg_type_qualifier != expected_type_qualifier);
-                if ( error )
-                {
-                    log_error( "ERROR: Bad type qualifier, kernel: \"%s\", argument number: %d, expected \"0x%X\", got \"0x%X\"\n", kernel_name, (unsigned int)j, (unsigned int)expected_type_qualifier, (unsigned int)arg_type_qualifier );
-                    arg_rc = -1;
-                }
-
-                // Try to get the type of each argument.
-                memset( name, 0, max_name_len );
-                error = clGetKernelArgInfo(kernel, (cl_uint)j, CL_KERNEL_ARG_TYPE_NAME, max_name_len, name, &size );
-                test_error( error, "Unable to get argument type name" );
-                error = strcmp( (const char*) name, expected_type_name );
-                if ( error )
-                {
-                    log_error( "ERROR: Bad argument type name, kernel: \"%s\", argument number: %d, expected \"%s\", got \"%s\"\n", kernel_name, (unsigned int)j, expected_type_name, name );
-                    arg_rc = -1;
-                }
-
-                // Try to get the name of each argument.
-                memset( name, 0, max_name_len );
-                error = clGetKernelArgInfo( kernel, (cl_uint)j, CL_KERNEL_ARG_NAME, max_name_len, name, &size );
-                test_error( error, "Unable to get argument name" );
-                error = strcmp( (const char*) name, expected_arg_name );
-                if ( error )
-                {
-                    log_error( "ERROR: Bad argument name, kernel: \"%s\", argument number: %d, expected \"%s\", got \"%s\"\n", kernel_name, (unsigned int)j, expected_arg_name, name );
-                    arg_rc = -1;
-                }
-
-                if(arg_rc != 0) {
-                    kernel_rc = -1;
-                }
-            }
-        }
-
-        //log_info( "%s ... %s\n",arg_info[i][0],kernel_rc == 0 ? "passed" : "failed" );
-        if(kernel_rc != 0) {
-            rc = -1;
-        }
-    }
-  return rc;
-}
-
-
-int test_get_kernel_arg_info_compatibility( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
-{
-    size_t size;
-    int error;
-
-    cl_bool supports_double = 0; // assume not
-    cl_bool supports_half = 0; // assume not
-  cl_bool supports_images = 0; // assume not
-
-    // Check if this device supports images
-  error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE_SUPPORT, sizeof supports_images, &supports_images, NULL);
-  test_error(error, "clGetDeviceInfo for CL_DEVICE_IMAGE_SUPPORT failed");
-
-  if (supports_images) {
-    log_info(" o Device supports images\n");
-    log_info(" o Expecting SUCCESS when testing image kernel arguments.\n");
-  }
-  else {
-    log_info(" o Device lacks image support\n");
-    log_info(" o Not testing image kernel arguments.\n");
-  }
-
-    if (is_extension_available(deviceID, "cl_khr_fp64")) {
-        log_info(" o Device claims extension 'cl_khr_fp64'\n");
-        log_info(" o Expecting SUCCESS when testing double kernel arguments.\n");
-        supports_double = 1;
-    } else {
-        cl_device_fp_config double_fp_config;
-        error = clGetDeviceInfo(deviceID, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(double_fp_config), &double_fp_config, NULL);
-        test_error(error, "clGetDeviceInfo for CL_DEVICE_DOUBLE_FP_CONFIG failed");
-        if (double_fp_config != 0)
-            supports_double = 1;
-        else {
-            log_info(" o Device lacks extension 'cl_khr_fp64'\n");
-            log_info(" o Not testing double kernel arguments.\n");
-            supports_double = 0;
-        }
-    }
-
-    if (is_extension_available(deviceID, "cl_khr_fp16")) {
-        log_info(" o Device claims extension 'cl_khr_fp16'\n");
-        log_info(" o Expecting SUCCESS when testing halfn* kernel arguments.\n");
-        supports_half = 1;
-    } else {
-        log_info(" o Device lacks extension 'cl_khr_fp16'\n");
-        log_info(" o Not testing halfn* kernel arguments.\n");
-        supports_half = 0;
-    }
-
-
-  int test_failed = 0;
-
-    // Now create a test program using required arguments
-  log_info("Testing required kernel arguments...\n");
-  error = test(deviceID, context, required_kernel_args, sizeof(required_kernel_args)/sizeof(required_kernel_args[0]), required_arg_info, sizeof(required_arg_info)/sizeof(required_arg_info[0]));
-  test_failed = (error) ? -1 : test_failed;
-
-  if ( supports_images ) {
-    log_info("Testing optional image arguments...\n");
-    error = test(deviceID, context, image_kernel_args, sizeof(image_kernel_args)/sizeof(image_kernel_args[0]), image_arg_info, sizeof(image_arg_info)/sizeof(image_arg_info[0]));
-    test_failed = (error) ? -1 : test_failed;
-  }
-
-    if ( supports_double ) {
-    log_info("Testing optional double arguments...\n");
-    error = test(deviceID, context, double_kernel_args, sizeof(double_kernel_args)/sizeof(double_kernel_args[0]), double_arg_info, sizeof(double_arg_info)/sizeof(double_arg_info[0]));
-    test_failed = (error) ? -1 : test_failed;
-  }
-
-    if ( supports_half ) {
-    log_info("Testing optional half arguments...\n");
-    error = test(deviceID, context, half_kernel_args, sizeof(half_kernel_args)/sizeof(half_kernel_args[0]), half_arg_info, sizeof(half_arg_info)/sizeof(half_arg_info[0]));
-    test_failed = (error) ? -1 : test_failed;
-  }
-
-    return test_failed;
-}
-
-

From 71e2681414901407548937b4b881f59a45b1c77c Mon Sep 17 00:00:00 2001
From: Chetan Mistry <70694498+chemis01@users.noreply.github.com>
Date: Thu, 13 May 2021 09:20:45 +0100
Subject: [PATCH 088/158] Add Test for CL_KERNEL_ATTRIBUTES (#832) (#1055)

* Improve Functionality of Harness

In the harness we previously were able to determine whether or
not a device supports the half or double data types, but doing so
required unintuitive function calls and would need to be repeated
per test.
A new pair of functions has been added which clearly state
what they do and make it easier to determine whether or not
a device supports the types.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>

* Add Test for CL_KERNEL_ATTRIBUTES (#832)

This test generates dummy kernels which have any
permutation combining the following attributes:

    * vec_type_hint
    * work_group_size_hint
    * reqd_work_group_size

It then gets the attributes by using clGetKernelInfo
and validates that the attributes returned are correct
by checking that the attributes which were used to generate
the kernel are present in the string returned from
clGetKernelInfo.
This test has been implemented as part of the
test_conformance/api suite.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>

* [SQUASH] Remove Signed Vector Attribute Hints

As per comments, SPIR-V does not distinguish the signedness
of an argument. This change removes the "signed" types
to ensure that the test passes in all scenarios.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>

* [SQUASH] Add TODO for Signed Vector Hints

As the current version only tests for unsigned
vector types (uchar/uint/etc), add a TODO in the code
as a reference to future work to introduce signed vector
tests.

Signed-off-by: Chetankumar Mistry <chetan.mistry@arm.com>
---
 test_conformance/api/CMakeLists.txt           |   1 +
 test_conformance/api/main.cpp                 |   1 +
 test_conformance/api/procs.h                  |   2 +
 .../api/test_kernel_attributes.cpp            | 339 ++++++++++++++++++
 4 files changed, 343 insertions(+)
 create mode 100644 test_conformance/api/test_kernel_attributes.cpp

diff --git a/test_conformance/api/CMakeLists.txt b/test_conformance/api/CMakeLists.txt
index 375e92d3d8..d3e6c6a7c4 100644
--- a/test_conformance/api/CMakeLists.txt
+++ b/test_conformance/api/CMakeLists.txt
@@ -14,6 +14,7 @@ set(${MODULE_NAME}_SOURCES
          test_api_min_max.cpp
          test_kernel_arg_changes.cpp
          test_kernel_arg_multi_setup.cpp
+         test_kernel_attributes.cpp
          test_binary.cpp
          test_native_kernel.cpp
          test_mem_objects.cpp
diff --git a/test_conformance/api/main.cpp b/test_conformance/api/main.cpp
index ef9f00cf2b..fa76a4064f 100644
--- a/test_conformance/api/main.cpp
+++ b/test_conformance/api/main.cpp
@@ -59,6 +59,7 @@ test_definition test_list[] = {
     ADD_TEST(set_kernel_arg_constant),
     ADD_TEST(set_kernel_arg_struct_array),
     ADD_TEST(kernel_global_constant),
+    ADD_TEST(kernel_attributes),
 
     ADD_TEST(min_max_thread_dimensions),
     ADD_TEST(min_max_work_items_sizes),
diff --git a/test_conformance/api/procs.h b/test_conformance/api/procs.h
index e9c45c360d..1bcb311626 100644
--- a/test_conformance/api/procs.h
+++ b/test_conformance/api/procs.h
@@ -202,3 +202,5 @@ extern int test_negative_get_platform_ids(cl_device_id deviceID,
                                           cl_context context,
                                           cl_command_queue queue,
                                           int num_elements);
+extern int test_kernel_attributes(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements);
diff --git a/test_conformance/api/test_kernel_attributes.cpp b/test_conformance/api/test_kernel_attributes.cpp
new file mode 100644
index 0000000000..2e4e0a7f19
--- /dev/null
+++ b/test_conformance/api/test_kernel_attributes.cpp
@@ -0,0 +1,339 @@
+//
+// Copyright (c) 2020 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include <iostream>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include "procs.h"
+#include "harness/errorHelpers.h"
+#include "harness/typeWrappers.h"
+#include "harness/parseParameters.h"
+
+using KernelAttributes = std::vector<std::string>;
+
+static std::string generate_kernel_source(const KernelAttributes& attributes)
+{
+    std::string kernel;
+    for (auto attribute : attributes)
+    {
+        kernel += "__attribute__((" + attribute + "))\n";
+    }
+    kernel += "__kernel void test_kernel(){}";
+    return kernel;
+}
+
+
+using AttributePermutations = std::vector<KernelAttributes>;
+
+// The following combinations have been chosen as they place each of the
+// attribute types in the different orders that they can occur. While distinct
+// permutations would provide a complete overview of the API the sheer number of
+// combinations increases the runtime of this test by an unreasonable amount
+AttributePermutations vect_tests;
+AttributePermutations work_tests;
+AttributePermutations reqd_tests;
+
+AttributePermutations vect_reqd_tests;
+AttributePermutations work_vect_tests;
+AttributePermutations reqd_work_tests;
+
+AttributePermutations vect_work_reqd_tests;
+AttributePermutations work_reqd_vect_tests;
+AttributePermutations reqd_vect_work_tests;
+
+
+// Generate a vector with vec_type_hint(<data_type>) so that it can be used to
+// generate different kernels
+static KernelAttributes generate_vec_type_hint_data(cl_device_id deviceID)
+{
+    KernelAttributes vec_type_hint_data;
+    // TODO Test for signed vectors (char/short/int/etc)
+    std::vector<std::string> vector_types = { "uchar", "ushort", "uint",
+                                              "float" };
+    if (gHasLong)
+    {
+        vector_types.push_back("ulong");
+    }
+    if (device_supports_half(deviceID))
+    {
+        vector_types.push_back("half");
+    }
+    if (device_supports_double(deviceID))
+    {
+        vector_types.push_back("double");
+    }
+
+    const auto vector_sizes = { "2", "3", "4", "8", "16" };
+    for (auto type : vector_types)
+    {
+        for (auto size : vector_sizes)
+        {
+            vec_type_hint_data.push_back("vec_type_hint(" + type + size + ")");
+        }
+    }
+    return vec_type_hint_data;
+}
+
+
+struct WorkGroupDimensions
+{
+    int x;
+    int y;
+    int z;
+};
+
+// Generate vectors to store reqd_work_group_size(<dimensions>) and
+// work_group_size_hint(<dimensions>) so that they can be used to generate
+// different kernels
+static KernelAttributes generate_reqd_work_group_size_data(
+    const std::vector<WorkGroupDimensions>& work_group_dimensions)
+{
+    KernelAttributes reqd_work_group_size_data;
+    for (auto dimension : work_group_dimensions)
+    {
+        reqd_work_group_size_data.push_back(
+            "reqd_work_group_size(" + std::to_string(dimension.x) + ","
+            + std::to_string(dimension.y) + "," + std::to_string(dimension.z)
+            + ")");
+    }
+    return reqd_work_group_size_data;
+}
+
+static KernelAttributes generate_work_group_size_data(
+    const std::vector<WorkGroupDimensions>& work_group_dimensions)
+{
+    KernelAttributes work_group_size_hint_data;
+    for (auto dimension : work_group_dimensions)
+    {
+        work_group_size_hint_data.push_back(
+            "work_group_size_hint(" + std::to_string(dimension.x) + ","
+            + std::to_string(dimension.y) + "," + std::to_string(dimension.z)
+            + ")");
+    }
+    return work_group_size_hint_data;
+}
+
+// Populate the Global Vectors which store individual Kernel Attributes
+static void populate_single_attribute_tests(
+    // Vectors to store the different data that fill the attributes
+    const KernelAttributes& vec_type_hint_data,
+    const KernelAttributes& work_group_size_hint_data,
+    const KernelAttributes& reqd_work_group_size_data)
+{
+    for (auto vector_test : vec_type_hint_data)
+    {
+        // Initialise vec_type_hint attribute tests
+        vect_tests.push_back({ vector_test });
+    }
+    for (auto work_group_test : work_group_size_hint_data)
+    {
+
+        // Initialise work_group_size_hint attribute test
+        work_tests.push_back({ work_group_test });
+    }
+    for (auto reqd_work_group_test : reqd_work_group_size_data)
+    {
+
+        // Initialise reqd_work_group_size attribute tests
+        reqd_tests.push_back({ reqd_work_group_test });
+    }
+}
+
+// Populate the Global Vectors which store the different permutations of 2
+// Kernel Attributes
+static void populate_double_attribute_tests(
+    const KernelAttributes& vec_type_hint_data,
+    const KernelAttributes& work_group_size_hint_data,
+    const KernelAttributes& reqd_work_group_size_data)
+{
+    for (auto vector_test : vec_type_hint_data)
+    {
+        for (auto work_group_test : work_group_size_hint_data)
+        {
+            // Initialise the tests for the permutation of work_group_size_hint
+            // combined with vec_type_hint
+            work_vect_tests.push_back({ work_group_test, vector_test });
+        }
+        for (auto reqd_work_group_test : reqd_work_group_size_data)
+        {
+            // Initialise the tests for the permutation of vec_type_hint and
+            // reqd_work_group_size
+            vect_reqd_tests.push_back({ vector_test, reqd_work_group_test });
+        }
+    }
+    for (auto work_group_test : work_group_size_hint_data)
+    {
+
+        for (auto reqd_work_group_test : reqd_work_group_size_data)
+        {
+            // Initialse the tests for the permutation of reqd_work_group_size
+            // and  work_group_size_hint
+            reqd_work_tests.push_back(
+                { reqd_work_group_test, work_group_test });
+        }
+    }
+}
+
+// Populate the Global Vectors which store the different permutations of 3
+// Kernel Attributes
+static void populate_triple_attribute_tests(
+    const KernelAttributes& vec_type_hint_data,
+    const KernelAttributes& work_group_size_hint_data,
+    const KernelAttributes& reqd_work_group_size_data)
+{
+    for (auto vector_test : vec_type_hint_data)
+    {
+        for (auto work_group_test : work_group_size_hint_data)
+        {
+            for (auto reqd_work_group_test : reqd_work_group_size_data)
+            {
+                //  Initialise the chosen permutations of 3 attributes
+                vect_work_reqd_tests.push_back(
+                    { vector_test, work_group_test, reqd_work_group_test });
+                work_reqd_vect_tests.push_back(
+                    { work_group_test, reqd_work_group_test, vector_test });
+                reqd_vect_work_tests.push_back(
+                    { reqd_work_group_test, vector_test, work_group_test });
+            }
+        }
+    }
+}
+
+static const std::vector<AttributePermutations*>
+generate_attribute_tests(const KernelAttributes& vec_type_hint_data,
+                         const KernelAttributes& work_group_size_hint_data,
+                         const KernelAttributes& reqd_work_group_size_data)
+{
+    populate_single_attribute_tests(vec_type_hint_data,
+                                    work_group_size_hint_data,
+                                    reqd_work_group_size_data);
+    populate_double_attribute_tests(vec_type_hint_data,
+                                    work_group_size_hint_data,
+                                    reqd_work_group_size_data);
+    populate_triple_attribute_tests(vec_type_hint_data,
+                                    work_group_size_hint_data,
+                                    reqd_work_group_size_data);
+
+    // Store all of the filled vectors in a single structure
+    const std::vector<AttributePermutations*> all_tests = {
+        &vect_tests,           &work_tests,           &reqd_tests,
+
+        &work_vect_tests,      &vect_reqd_tests,      &reqd_work_tests,
+
+        &vect_work_reqd_tests, &work_reqd_vect_tests, &reqd_vect_work_tests
+    };
+    return all_tests;
+}
+
+static const std::vector<AttributePermutations*>
+initialise_attribute_data(cl_device_id deviceID)
+{
+    // This vector stores different work group dimensions that can be used by
+    // the reqd_work_group_size and work_group_size_hint attributes. It
+    // currently only has a single value to minimise time complexity of the
+    // overall test but can be easily changed.
+    static const std::vector<WorkGroupDimensions> work_group_dimensions = {
+        { 1, 1, 1 }
+    };
+    KernelAttributes vec_type_hint_data = generate_vec_type_hint_data(deviceID);
+    KernelAttributes work_group_size_hint_data =
+        generate_work_group_size_data(work_group_dimensions);
+    KernelAttributes reqd_work_group_size_data =
+        generate_reqd_work_group_size_data(work_group_dimensions);
+
+    // Generate all the permutations of attributes to create different test
+    // suites
+    return generate_attribute_tests(vec_type_hint_data,
+                                    work_group_size_hint_data,
+                                    reqd_work_group_size_data);
+}
+
+static bool run_test(cl_context context, cl_device_id deviceID,
+                     const AttributePermutations& permutations)
+{
+    bool success = true;
+    for (auto attribute_permutation : permutations)
+    {
+
+        std::string kernel_source_string =
+            generate_kernel_source(attribute_permutation);
+        const char* kernel_src = kernel_source_string.c_str();
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+        cl_int err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                                 &kernel_src, "test_kernel");
+        test_error(err, "create_single_kernel_helper");
+
+        // Get the size of the kernel attribute string returned
+        size_t size = 0;
+        err = clGetKernelInfo(kernel, CL_KERNEL_ATTRIBUTES, 0, nullptr, &size);
+        test_error(err, "clGetKernelInfo");
+        std::vector<char> attributes(size);
+        err = clGetKernelInfo(kernel, CL_KERNEL_ATTRIBUTES, attributes.size(),
+                              attributes.data(), nullptr);
+        test_error(err, "clGetKernelInfo");
+        std::string attribute_string(attributes.data());
+        attribute_string.erase(
+            std::remove(attribute_string.begin(), attribute_string.end(), ' '),
+            attribute_string.end());
+        if (gCompilationMode != kOnline)
+        {
+            if (!attribute_string.empty())
+            {
+                success = false;
+                log_error("Error: Expected an empty string\n");
+                log_error("Attribute string reported as: %s\n",
+                          attribute_string.c_str());
+            }
+        }
+        else
+        {
+            bool permutation_success = true;
+            for (auto attribute : attribute_permutation)
+            {
+                if (attribute_string.find(attribute) == std::string::npos)
+                {
+                    success = false;
+                    permutation_success = false;
+                    log_error("ERROR: did not find expected attribute: '%s'\n",
+                              attribute.c_str());
+                }
+            }
+            if (!permutation_success)
+            {
+                log_error("Attribute string reported as: %s\n",
+                          attribute_string.c_str());
+            }
+        }
+    }
+    return success;
+}
+
+int test_kernel_attributes(cl_device_id deviceID, cl_context context,
+                           cl_command_queue queue, int num_elements)
+{
+    bool success = true;
+
+    // Vector to store all of the tests
+    const std::vector<AttributePermutations*> all_tests =
+        initialise_attribute_data(deviceID);
+
+    for (auto permutations : all_tests)
+    {
+        success = success && run_test(context, deviceID, *permutations);
+    }
+    return success ? TEST_PASS : TEST_FAIL;
+}

From db939bbb20d179fbebd574a4b5a84f56f59ed934 Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Thu, 13 May 2021 10:22:50 +0200
Subject: [PATCH 089/158] Fix test_buffer - undefined behavior in case of
 CL_MEM_USE_HOST_PTR (#1210)

---
 test_conformance/buffers/test_buffer_fill.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test_conformance/buffers/test_buffer_fill.cpp b/test_conformance/buffers/test_buffer_fill.cpp
index 3edfafbc38..9c9c7d17ef 100644
--- a/test_conformance/buffers/test_buffer_fill.cpp
+++ b/test_conformance/buffers/test_buffer_fill.cpp
@@ -702,10 +702,6 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
 
 int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
 {
-    clMemWrapper buffers[2];
-    void        *outptr;
-    TestStruct  *inptr;
-    TestStruct  *hostptr;
     TestStruct pattern;
     clProgramWrapper program;
     clKernelWrapper kernel;
@@ -741,6 +737,10 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
         for (n = 0; n < 8; n++)
         {
             clEventWrapper event[2];
+            clMemWrapper buffers[2];
+            void *outptr;
+            TestStruct *inptr;
+            TestStruct *hostptr;
 
             offset_elements =
                 (size_t)get_random_float(0.f, (float)(num_elements - 8), d);

From 02c2a424f73d0bd3582eed45441c3a57f86627be Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Thu, 13 May 2021 09:31:13 +0100
Subject: [PATCH 090/158] Remove unnecessary code from unary_u_*.cpp (#1237)

Only nan() is tested by unary_u_float.cpp and unary_u_double.cpp.
Testing of half_sin, half_tan and half_cos is done in unary_float.cpp
and unary_double.cpp.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/unary_u_double.cpp       |  1 -
 .../math_brute_force/unary_u_float.cpp        | 31 -------------------
 2 files changed, 32 deletions(-)

diff --git a/test_conformance/math_brute_force/unary_u_double.cpp b/test_conformance/math_brute_force/unary_u_double.cpp
index 940b0f884d..3c5f99da5f 100644
--- a/test_conformance/math_brute_force/unary_u_double.cpp
+++ b/test_conformance/math_brute_force/unary_u_double.cpp
@@ -243,7 +243,6 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
                     float err = Bruteforce_Ulp_Error_Double(test, correct);
                     int fail = !(fabsf(err) <= f->double_ulps);
 
-                    // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
                     if (fail)
                     {
                         if (ftz)
diff --git a/test_conformance/math_brute_force/unary_u_float.cpp b/test_conformance/math_brute_force/unary_u_float.cpp
index 5c8f6ae6e6..44c5af473f 100644
--- a/test_conformance/math_brute_force/unary_u_float.cpp
+++ b/test_conformance/math_brute_force/unary_u_float.cpp
@@ -126,8 +126,6 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal = 0.0f;
     uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
     int scale = (int)((1ULL << 32) / (16 * BUFFER_SIZE / sizeof(double)) + 1);
-    int isRangeLimited = 0;
-    float half_sin_cos_tan_limit = 0;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
@@ -147,22 +145,6 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
             return error;
     }
 
-    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
-    {
-        isRangeLimited = 1;
-        half_sin_cos_tan_limit = 1.0f
-            + float_ulps
-                * (FLT_EPSILON / 2.0f); // out of range results from finite
-                                        // inputs must be in [-1,1]
-    }
-    else if (0 == strcmp(f->name, "half_tan"))
-    {
-        isRangeLimited = 1;
-        half_sin_cos_tan_limit =
-            INFINITY; // out of range resut from finite inputs must be numeric
-    }
-
-
     for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
@@ -249,7 +231,6 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
 
         if (gSkipCorrectnessTesting) break;
 
-
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
         for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
@@ -266,18 +247,6 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
                     float err = Ulp_Error(test, correct);
                     int fail = !(fabsf(err) <= float_ulps);
 
-                    // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
-                    if (isRangeLimited
-                        && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16)
-                        && fabsf(s[j]) < INFINITY)
-                    {
-                        if (fabsf(test) <= half_sin_cos_tan_limit)
-                        {
-                            err = 0;
-                            fail = 0;
-                        }
-                    }
-
                     if (fail)
                     {
                         if (ftz)

From e7c5694cf55bc9eba65a95f9c8a0a90434cd7abf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=A9vin=20Petit?= <kpet@free.fr>
Date: Fri, 14 May 2021 09:44:38 +0100
Subject: [PATCH 091/158] Fix image pixel reference calculation for
 CL_{INTENSITY,LUMINANCE} formats (#1247)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As per 6.15.15.7, the first three components have to be set to the luminance
value and all components to the intensity value.

Signed-off-by: Kévin Petit <kpet@free.fr>
---
 test_common/harness/imageHelpers.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test_common/harness/imageHelpers.h b/test_common/harness/imageHelpers.h
index aba70508c8..848ec65574 100644
--- a/test_common/harness/imageHelpers.h
+++ b/test_common/harness/imageHelpers.h
@@ -489,12 +489,14 @@ void read_image_pixel(void *imageData, image_descriptor *imageInfo, int x,
     }
     else if (format->image_channel_order == CL_INTENSITY)
     {
+        outData[0] = tempData[0];
         outData[1] = tempData[0];
         outData[2] = tempData[0];
         outData[3] = tempData[0];
     }
     else if (format->image_channel_order == CL_LUMINANCE)
     {
+        outData[0] = tempData[0];
         outData[1] = tempData[0];
         outData[2] = tempData[0];
     }

From 657283799482433cc35d119ddab220dcad1f00aa Mon Sep 17 00:00:00 2001
From: Pierre Moreau <pierremoreau@users.noreply.github.com>
Date: Fri, 14 May 2021 10:44:51 +0200
Subject: [PATCH 092/158] buffers: Switch from enqueueing to enqueuing (#1246)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OpenCL-Docs has 45 occurrences of “enqueuing” but a single one of
“enqueueing”.
---
 test_conformance/buffers/test_buffer_migrate.cpp | 6 +++---
 test_conformance/buffers/test_image_migrate.cpp  | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/test_conformance/buffers/test_buffer_migrate.cpp b/test_conformance/buffers/test_buffer_migrate.cpp
index a5b6f26a89..f30983665b 100644
--- a/test_conformance/buffers/test_buffer_migrate.cpp
+++ b/test_conformance/buffers/test_buffer_migrate.cpp
@@ -295,9 +295,9 @@ int test_buffer_migrate(cl_device_id deviceID, cl_context context, cl_command_qu
             }
 
             if ((err = clEnqueueNDRangeKernel(queues[i], kernel, 1, NULL, wgs, NULL, 0, NULL, NULL)) != CL_SUCCESS) {
-              print_error(err, "Failed enqueueing the NDRange kernel.");
-              failed = 1;
-              goto cleanup;
+                print_error(err, "Failed enqueuing the NDRange kernel.");
+                failed = 1;
+                goto cleanup;
             }
           }
           // Verify the results as long as neither input is an undefined migration
diff --git a/test_conformance/buffers/test_image_migrate.cpp b/test_conformance/buffers/test_image_migrate.cpp
index 31bb0a2151..dbdca9cc8d 100644
--- a/test_conformance/buffers/test_image_migrate.cpp
+++ b/test_conformance/buffers/test_image_migrate.cpp
@@ -345,9 +345,9 @@ int test_image_migrate(cl_device_id deviceID, cl_context context, cl_command_que
             }
 
             if ((err = clEnqueueNDRangeKernel(queues[i], kernel, 2, NULL, wgs, wls, 0, NULL, NULL)) != CL_SUCCESS) {
-              print_error(err, "Failed enqueueing the NDRange kernel.");
-              failed = 1;
-              goto cleanup;
+                print_error(err, "Failed enqueuing the NDRange kernel.");
+                failed = 1;
+                goto cleanup;
             }
           }
           // Verify the results as long as neither input is an undefined migration

From 17a0d0956795b1805814066f861a37ea1a607d46 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Tue, 18 May 2021 18:09:46 +0100
Subject: [PATCH 093/158] Cleanup usage of static, extern and typedef (#1256)

* Cleanup usage of static, extern and typedef

Remove static on functions defined in headers, as it can result in
duplication in binaries.

Remove unnecessary extern keyword on a function declaration, as it is
the default behavior and can be puzzling when reading the code.

Remove the unused declaration of my_ilogb, which is never defined.

Remove unnecessary usage of typedef, as they are only increasing the
cognitive load of the code for no purpose.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>

* Improve usage of inline and static in harness

Functions declared in header as static can trigger unused warnings when
(indirectly) included in translation units that do not use such
functions. Use inline instead, which also avoids duplicating symbols in
binaries.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_common/harness/alloc.h                   |  4 +--
 test_common/harness/fpcontrol.h               |  6 ++--
 .../math_brute_force/function_list.h          | 16 ++++-----
 test_conformance/math_brute_force/main.cpp    |  1 -
 .../math_brute_force/reference_math.cpp       | 16 ++++-----
 test_conformance/math_brute_force/utility.h   | 33 +++++++++----------
 6 files changed, 37 insertions(+), 39 deletions(-)

diff --git a/test_common/harness/alloc.h b/test_common/harness/alloc.h
index 653dde05f5..3b00d7c914 100644
--- a/test_common/harness/alloc.h
+++ b/test_common/harness/alloc.h
@@ -29,7 +29,7 @@
 #include "mingw_compat.h"
 #endif
 
-static void* align_malloc(size_t size, size_t alignment)
+inline void* align_malloc(size_t size, size_t alignment)
 {
 #if defined(_WIN32) && defined(_MSC_VER)
     return _aligned_malloc(size, alignment);
@@ -53,7 +53,7 @@ static void* align_malloc(size_t size, size_t alignment)
 #endif
 }
 
-static void align_free(void* ptr)
+inline void align_free(void* ptr)
 {
 #if defined(_WIN32) && defined(_MSC_VER)
     _aligned_free(ptr);
diff --git a/test_common/harness/fpcontrol.h b/test_common/harness/fpcontrol.h
index 40826c5c81..9f065044e4 100644
--- a/test_common/harness/fpcontrol.h
+++ b/test_common/harness/fpcontrol.h
@@ -39,7 +39,7 @@ typedef int FPU_mode_type;
 extern __thread fpu_control_t fpu_control;
 #endif
 // Set the reference hardware floating point unit to FTZ mode
-static inline void ForceFTZ(FPU_mode_type *mode)
+inline void ForceFTZ(FPU_mode_type *mode)
 {
 #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)              \
     || defined(__MINGW32__)
@@ -65,7 +65,7 @@ static inline void ForceFTZ(FPU_mode_type *mode)
 }
 
 // Disable the denorm flush to zero
-static inline void DisableFTZ(FPU_mode_type *mode)
+inline void DisableFTZ(FPU_mode_type *mode)
 {
 #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)              \
     || defined(__MINGW32__)
@@ -91,7 +91,7 @@ static inline void DisableFTZ(FPU_mode_type *mode)
 }
 
 // Restore the reference hardware to floating point state indicated by *mode
-static inline void RestoreFPState(FPU_mode_type *mode)
+inline void RestoreFPState(FPU_mode_type *mode)
 {
 #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)              \
     || defined(__MINGW32__)
diff --git a/test_conformance/math_brute_force/function_list.h b/test_conformance/math_brute_force/function_list.h
index 38f739ce0e..95a2945932 100644
--- a/test_conformance/math_brute_force/function_list.h
+++ b/test_conformance/math_brute_force/function_list.h
@@ -30,7 +30,7 @@
 
 #include "harness/mt19937.h"
 
-typedef union fptr {
+union fptr {
     void *p;
     double (*f_f)(double);
     double (*f_u)(cl_uint);
@@ -45,9 +45,9 @@ typedef union fptr {
     double (*f_ffpI)(double, double, int *);
     double (*f_fff)(double, double, double);
     float (*f_fma)(float, float, float, int);
-} fptr;
+};
 
-typedef union dptr {
+union dptr {
     void *p;
     long double (*f_f)(long double);
     long double (*f_u)(cl_ulong);
@@ -59,20 +59,20 @@ typedef union dptr {
     long double (*f_fpI)(long double, int *);
     long double (*f_ffpI)(long double, long double, int *);
     long double (*f_fff)(long double, long double, long double);
-} dptr;
+};
 
 struct Func;
 
-typedef struct vtbl
+struct vtbl
 {
     const char *type_name;
     int (*TestFunc)(const struct Func *, MTdata, bool);
     int (*DoubleTestFunc)(
         const struct Func *, MTdata,
         bool); // may be NULL if function is single precision only
-} vtbl;
+};
 
-typedef struct Func
+struct Func
 {
     const char *name; // common name, to be used as an argument in the shell
     const char *nameInCode; // name as it appears in the __kernel, usually the
@@ -88,7 +88,7 @@ typedef struct Func
     int ftz;
     int relaxed;
     const vtbl *vtbl_ptr;
-} Func;
+};
 
 
 extern const Func functionList[];
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index d6c2f11ffb..e52f2f0a18 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -167,7 +167,6 @@ static int doTest(const char *name)
     }
 
     {
-        extern int my_ilogb(double);
         if (0 == strcmp("ilogb", func_data->name))
         {
             InitILogbConstants();
diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp
index 3a6516bae8..0b037e01d8 100644
--- a/test_conformance/math_brute_force/reference_math.cpp
+++ b/test_conformance/math_brute_force/reference_math.cpp
@@ -41,10 +41,10 @@
 #pragma STDC FP_CONTRACT OFF
 static void __log2_ep(double *hi, double *lo, double x);
 
-typedef union {
+union uint64d_t {
     uint64_t i;
     double d;
-} uint64d_t;
+};
 
 static const uint64d_t _CL_NAN = { 0x7ff8000000000000ULL };
 
@@ -2259,10 +2259,10 @@ long double reference_dividel(long double x, long double y)
     return dx / dy;
 }
 
-typedef struct
+struct double_double
 {
     double hi, lo;
-} double_double;
+};
 
 // Split doubles_double into a series of consecutive 26-bit precise doubles and
 // a remainder. Note for later -- for multiplication, it might be better to
@@ -3767,10 +3767,10 @@ static uint32_t two_over_pi[] = {
 static uint32_t pi_over_two[] = { 0x1,        0x2487ed51, 0x42d1846,
                                   0x26263314, 0x1701b839, 0x28948127 };
 
-typedef union {
+union d_ui64_t {
     uint64_t u;
     double d;
-} d_ui64_t;
+};
 
 // radix or base of representation
 #define RADIX (30)
@@ -3786,13 +3786,13 @@ d_ui64_t two_pow_two_mradix = { (uint64_t)(1023 - 2 * RADIX) << 52 };
 // extended fixed point representation of double precision
 // floating point number.
 // x = sign * [ sum_{i = 0 to 2} ( X[i] * 2^(index - i)*RADIX ) ]
-typedef struct
+struct eprep_t
 {
     uint32_t X[3]; // three 32 bit integers are sufficient to represnt double in
                    // base_30
     int index; // exponent bias
     int sign; // sign of double
-} eprep_t;
+};
 
 static eprep_t double_to_eprep(double x)
 {
diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h
index ac4db9c8de..b4a59edb55 100644
--- a/test_conformance/math_brute_force/utility.h
+++ b/test_conformance/math_brute_force/utility.h
@@ -90,8 +90,7 @@ int MakeKernels(const char **c, cl_uint count, const char *name,
                 bool relaxedMode);
 
 // used to convert a bucket of bits into a search pattern through double
-static inline double DoubleFromUInt32(uint32_t bits);
-static inline double DoubleFromUInt32(uint32_t bits)
+inline double DoubleFromUInt32(uint32_t bits)
 {
     union {
         uint64_t u;
@@ -117,25 +116,25 @@ void _LogBuildError(cl_program p, int line, const char *file);
 // premature flushing to zero.
 // However, to avoid conflict for 1.0, we are letting results at TYPE_MIN +
 // ulp_limit to be flushed to zero.
-static inline int IsFloatResultSubnormal(double x, float ulps)
+inline int IsFloatResultSubnormal(double x, float ulps)
 {
     x = fabs(x) - MAKE_HEX_DOUBLE(0x1.0p-149, 0x1, -149) * (double)ulps;
     return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126);
 }
 
-static inline int IsFloatResultSubnormalAbsError(double x, float abs_err)
+inline int IsFloatResultSubnormalAbsError(double x, float abs_err)
 {
     x = x - abs_err;
     return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126);
 }
 
-static inline int IsDoubleResultSubnormal(long double x, float ulps)
+inline int IsDoubleResultSubnormal(long double x, float ulps)
 {
     x = fabsl(x) - MAKE_HEX_LONG(0x1.0p-1074, 0x1, -1074) * (long double)ulps;
     return x < MAKE_HEX_LONG(0x1.0p-1022, 0x1, -1022);
 }
 
-static inline int IsFloatInfinity(double x)
+inline int IsFloatInfinity(double x)
 {
     union {
         cl_float d;
@@ -145,7 +144,7 @@ static inline int IsFloatInfinity(double x)
     return ((u.u & 0x7fffffffU) == 0x7F800000U);
 }
 
-static inline int IsFloatMaxFloat(double x)
+inline int IsFloatMaxFloat(double x)
 {
     union {
         cl_float d;
@@ -155,7 +154,7 @@ static inline int IsFloatMaxFloat(double x)
     return ((u.u & 0x7fffffffU) == 0x7F7FFFFFU);
 }
 
-static inline int IsFloatNaN(double x)
+inline int IsFloatNaN(double x)
 {
     union {
         cl_float d;
@@ -165,13 +164,13 @@ static inline int IsFloatNaN(double x)
     return ((u.u & 0x7fffffffU) > 0x7F800000U);
 }
 
-extern cl_uint RoundUpToNextPowerOfTwo(cl_uint x);
+cl_uint RoundUpToNextPowerOfTwo(cl_uint x);
 
 // Windows (since long double got deprecated) sets the x87 to 53-bit precision
 // (that's x87 default state).  This causes problems with the tests that
 // convert long and ulong to float and double or otherwise deal with values
 // that need more precision than 53-bit. So, set the x87 to 64-bit precision.
-static inline void Force64BitFPUPrecision(void)
+inline void Force64BitFPUPrecision(void)
 {
 #if __MINGW32__
     // The usual method is to use _controlfp as follows:
@@ -202,17 +201,17 @@ static inline void Force64BitFPUPrecision(void)
 #endif
 }
 
-extern void memset_pattern4(void *dest, const void *src_pattern, size_t bytes);
+void memset_pattern4(void *dest, const void *src_pattern, size_t bytes);
 
-typedef union {
+union int32f_t {
     int32_t i;
     float f;
-} int32f_t;
+};
 
-typedef union {
+union int64d_t {
     int64_t l;
     double d;
-} int64d_t;
+};
 
 void MulD(double *rhi, double *rlo, double u, double v);
 void AddD(double *rhi, double *rlo, double a, double b);
@@ -229,7 +228,7 @@ void logFunctionInfo(const char *fname, unsigned int float_size,
 
 float getAllowedUlpError(const Func *f, const bool relaxed);
 
-static inline cl_uint getTestScale(size_t typeSize)
+inline cl_uint getTestScale(size_t typeSize)
 {
     if (gWimpyMode)
     {
@@ -245,7 +244,7 @@ static inline cl_uint getTestScale(size_t typeSize)
     }
 }
 
-static inline uint64_t getTestStep(size_t typeSize, size_t bufferSize)
+inline uint64_t getTestStep(size_t typeSize, size_t bufferSize)
 {
     if (gWimpyMode)
     {

From 6c8045911ab193143eae48eef68fc966d0d96b1f Mon Sep 17 00:00:00 2001
From: Sreelakshmi Haridas Maruthur <sharidas@quicinc.com>
Date: Tue, 18 May 2021 11:10:24 -0600
Subject: [PATCH 094/158] gles: Fix compile warnings. (#1070)

* gles: Fix compile warnings.

For 32 and 64-bit Visual Studio and the Android Q NDK.

* Fix formatting violations

Co-authored-by: spauls <spauls@qti.qualcomm.com>
---
 CMakeLists.txt                              |   4 -
 test_common/CMakeLists.txt                  |   5 +-
 test_common/gles/helpers.cpp                |   6 +-
 test_common/gles/helpers.h                  |   5 +-
 test_common/harness/ThreadPool.cpp          |  11 +-
 test_common/harness/compat.h                |   4 +-
 test_common/harness/conversions.cpp         |   4 +-
 test_common/harness/errorHelpers.cpp        |   6 +-
 test_common/harness/errorHelpers.h          |   5 -
 test_common/harness/fpcontrol.h             |   8 +-
 test_common/harness/imageHelpers.cpp        | 128 +++++++++++---------
 test_common/harness/kernelHelpers.cpp       |   4 +-
 test_common/harness/os_helpers.cpp          |   3 +-
 test_common/harness/propertyHelpers.cpp     |  11 +-
 test_common/harness/rounding_mode.cpp       |  10 +-
 test_common/harness/rounding_mode.h         |   2 -
 test_common/harness/threadTesting.cpp       |  98 ---------------
 test_common/harness/threadTesting.h         |   5 +-
 test_conformance/gles/CMakeLists.txt        |   8 ++
 test_conformance/gles/setup_egl.cpp         |   5 +-
 test_conformance/gles/test_fence_sync.cpp   |  10 +-
 test_conformance/gles/test_images_2D.cpp    |   2 +
 test_conformance/gles/test_renderbuffer.cpp |   2 +
 23 files changed, 136 insertions(+), 210 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 083ea96d5f..5b1f48fd86 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -152,10 +152,6 @@ if(LINK_PTHREAD)
     list(APPEND CLConform_LIBRARIES pthread)
 endif()
 
-if(DEFINED USE_GLES3)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGLES3")
-endif()
-
 if(APPLE)
     find_library(corefoundation CoreFoundation)
     find_library(iokit IOKit)
diff --git a/test_common/CMakeLists.txt b/test_common/CMakeLists.txt
index 2d4bc19091..61580300b6 100644
--- a/test_common/CMakeLists.txt
+++ b/test_common/CMakeLists.txt
@@ -1,6 +1,5 @@
 
 set(HARNESS_SOURCES
-    harness/threadTesting.cpp
     harness/typeWrappers.cpp
     harness/mt19937.cpp
     harness/conversions.cpp
@@ -23,3 +22,7 @@ set(HARNESS_SOURCES
 
 add_library(harness STATIC ${HARNESS_SOURCES})
 
+if(MSVC)
+    # Don't warn about using the portable "strdup" function.
+    target_compile_definitions(harness PRIVATE _CRT_NONSTDC_NO_DEPRECATE)
+endif()
\ No newline at end of file
diff --git a/test_common/gles/helpers.cpp b/test_common/gles/helpers.cpp
index 34f40b4c3b..57a4ddc179 100644
--- a/test_common/gles/helpers.cpp
+++ b/test_common/gles/helpers.cpp
@@ -22,7 +22,7 @@
     {GLint __error = glGetError(); if(__error) {log_error( "GL ERROR: %s!\n", gluErrorString( err ));}}
 
 #if defined(__linux__) || defined(GL_ES_VERSION_2_0)
-// On linux we dont link to GLU library to avoid comaptibility issues with
+// On linux we don't link to GLU library to avoid compatibility issues with
 // libstdc++
 // FIXME: Implement this
 const GLubyte* gluErrorString (GLenum error)
@@ -271,8 +271,6 @@ void * ReadGLTexture( GLenum glTarget, GLuint glTexture,
     // Read results from the GL texture
     glBindTexture(get_base_gl_target(glTarget), glTexture);
 
-    GLint realWidth, realHeight;
-    GLint realInternalFormat;
     GLenum readBackFormat = GL_RGBA;
     GLenum readBackType = glType;
     glFramebufferWrapper glFramebuffer;
@@ -301,7 +299,7 @@ void * ReadGLTexture( GLenum glTarget, GLuint glTexture,
         GetGLFormatName(readBackFormat),
         GetGLTypeName(readBackType));
 
-    DumpGLBuffer(readBackType, realWidth, realHeight, (void*)outBuffer);
+    DumpGLBuffer(readBackType, outWidth, outHeight, (void *)outBuffer);
 
 #endif
 
diff --git a/test_common/gles/helpers.h b/test_common/gles/helpers.h
index 5bd0fdf1f9..207687875c 100644
--- a/test_common/gles/helpers.h
+++ b/test_common/gles/helpers.h
@@ -30,11 +30,10 @@
 
 #if !defined (__APPLE__)
 #include <CL/cl.h>
-#include "gl_headers.h"
 #include <CL/cl_gl.h>
-#else
-#include "gl_headers.h"
+#include <CL/cl_half.h>
 #endif
+#include "gl_headers.h"
 
 #include "harness/errorHelpers.h"
 #include "harness/kernelHelpers.h"
diff --git a/test_common/harness/ThreadPool.cpp b/test_common/harness/ThreadPool.cpp
index 31985aa090..5dae1b4aa3 100644
--- a/test_common/harness/ThreadPool.cpp
+++ b/test_common/harness/ThreadPool.cpp
@@ -523,7 +523,7 @@ void ThreadPool_Init(void)
                     {
                         // Count the number of bits in ProcessorMask (number of
                         // logical cores)
-                        ULONG mask = ptr->ProcessorMask;
+                        ULONG_PTR mask = ptr->ProcessorMask;
                         while (mask)
                         {
                             ++gThreadCount;
@@ -688,7 +688,10 @@ static BOOL CALLBACK _ThreadPool_Init(_PINIT_ONCE InitOnce, PVOID Parameter,
 
 void ThreadPool_Exit(void)
 {
-    int err, count;
+#ifndef _WIN32
+    int err;
+#endif
+    int count;
     gRunCount = CL_INT_MAX;
 
 #if defined(__GNUC__)
@@ -738,7 +741,9 @@ void ThreadPool_Exit(void)
 // all available then it would make more sense to use those features.
 cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo)
 {
+#ifndef _WIN32
     cl_int newErr;
+#endif
     cl_int err = 0;
     // Lazily set up our threads
 #if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
@@ -913,7 +918,9 @@ cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo)
 
     err = jobError;
 
+#ifndef _WIN32
 exit:
+#endif
     // exit critical region
 #if defined(_WIN32)
     LeaveCriticalSection(gThreadPoolLock);
diff --git a/test_common/harness/compat.h b/test_common/harness/compat.h
index 7aad15a09b..3b55785269 100644
--- a/test_common/harness/compat.h
+++ b/test_common/harness/compat.h
@@ -18,13 +18,13 @@
 
 #if defined(_WIN32) && defined(_MSC_VER)
 #include <Windows.h>
-#endif
-
+#else
 #ifdef __cplusplus
 #define EXTERN_C extern "C"
 #else
 #define EXTERN_C
 #endif
+#endif
 
 
 //
diff --git a/test_common/harness/conversions.cpp b/test_common/harness/conversions.cpp
index fc3317c7d7..c773126930 100644
--- a/test_common/harness/conversions.cpp
+++ b/test_common/harness/conversions.cpp
@@ -181,8 +181,8 @@ static ULong sUpperLimits[kNumExplicitTypes] = {
     0xffffffffLL,
     0xffffffffLL,
     0x7fffffffffffffffLL,
-    0xffffffffffffffffLL,
-    0xffffffffffffffffLL,
+    0xffffffffffffffffULL,
+    0xffffffffffffffffULL,
     0,
     0
 }; // Last two values aren't stored here
diff --git a/test_common/harness/errorHelpers.cpp b/test_common/harness/errorHelpers.cpp
index 22a2677d04..3ddbc37bcf 100644
--- a/test_common/harness/errorHelpers.cpp
+++ b/test_common/harness/errorHelpers.cpp
@@ -564,7 +564,7 @@ cl_int OutputBuildLogs(cl_program program, cl_uint num_devices,
             error = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL,
                                      &size_ret);
             test_error(error, "Unable to query context's device size");
-            num_devices = size_ret / sizeof(cl_device_id);
+            num_devices = static_cast<cl_uint>(size_ret / sizeof(cl_device_id));
             device_list = (cl_device_id *)malloc(size_ret);
             if (device_list == NULL)
             {
@@ -695,7 +695,7 @@ int check_functions_for_offline_compiler(const char *subtestname,
 {
     if (gCompilationMode != kOnline)
     {
-        int nNotRequiredWithOfflineCompiler =
+        size_t nNotRequiredWithOfflineCompiler =
             sizeof(subtests_to_skip_with_offline_compiler) / sizeof(char *);
         size_t i;
         for (i = 0; i < nNotRequiredWithOfflineCompiler; ++i)
@@ -707,4 +707,4 @@ int check_functions_for_offline_compiler(const char *subtestname,
         }
     }
     return 0;
-}
+}
\ No newline at end of file
diff --git a/test_common/harness/errorHelpers.h b/test_common/harness/errorHelpers.h
index 1944601467..c7f49e3dd7 100644
--- a/test_common/harness/errorHelpers.h
+++ b/test_common/harness/errorHelpers.h
@@ -56,11 +56,6 @@ static int vlog_win32(const char *format, ...);
 #define vlog printf
 #endif
 
-#define ct_assert(b) ct_assert_i(b, __LINE__)
-#define ct_assert_i(b, line) ct_assert_ii(b, line)
-#define ct_assert_ii(b, line)                                                  \
-    int _compile_time_assertion_on_line_##line[b ? 1 : -1];
-
 #define test_fail(msg, ...)                                                    \
     {                                                                          \
         log_error(msg, ##__VA_ARGS__);                                         \
diff --git a/test_common/harness/fpcontrol.h b/test_common/harness/fpcontrol.h
index 9f065044e4..2add9baf97 100644
--- a/test_common/harness/fpcontrol.h
+++ b/test_common/harness/fpcontrol.h
@@ -30,7 +30,11 @@
 // that rounding mode.
 #if defined(__APPLE__) || defined(_MSC_VER) || defined(__linux__)              \
     || defined(__MINGW32__)
+#ifdef _MSC_VER
 typedef int FPU_mode_type;
+#else
+typedef int64_t FPU_mode_type;
+#endif
 #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)              \
     || defined(__MINGW32__)
 #include <xmmintrin.h>
@@ -55,7 +59,7 @@ inline void ForceFTZ(FPU_mode_type *mode)
     __asm__ volatile("fmxr fpscr, %0" ::"r"(fpscr | (1U << 24)));
     // Add 64 bit support
 #elif defined(__aarch64__)
-    unsigned fpscr;
+    uint64_t fpscr;
     __asm__ volatile("mrs %0, fpcr" : "=r"(fpscr));
     *mode = fpscr;
     __asm__ volatile("msr fpcr, %0" ::"r"(fpscr | (1U << 24)));
@@ -81,7 +85,7 @@ inline void DisableFTZ(FPU_mode_type *mode)
     __asm__ volatile("fmxr fpscr, %0" ::"r"(fpscr & ~(1U << 24)));
     // Add 64 bit support
 #elif defined(__aarch64__)
-    unsigned fpscr;
+    uint64_t fpscr;
     __asm__ volatile("mrs %0, fpcr" : "=r"(fpscr));
     *mode = fpscr;
     __asm__ volatile("msr fpcr, %0" ::"r"(fpscr & ~(1U << 24)));
diff --git a/test_common/harness/imageHelpers.cpp b/test_common/harness/imageHelpers.cpp
index 72a2f0c03c..d1754653e4 100644
--- a/test_common/harness/imageHelpers.cpp
+++ b/test_common/harness/imageHelpers.cpp
@@ -554,8 +554,8 @@ struct AddressingTable
 {
     AddressingTable()
     {
-        ct_assert((CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE < 6));
-        ct_assert(CL_FILTER_NEAREST - CL_FILTER_LINEAR < 2);
+        static_assert(CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE < 6, "");
+        static_assert(CL_FILTER_NEAREST - CL_FILTER_LINEAR < 2, "");
 
         mTable[CL_ADDRESS_NONE - CL_ADDRESS_NONE]
               [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = NoAddressFn;
@@ -719,7 +719,7 @@ void get_max_sizes(
     if (usingMaxPixelSizeBuffer || raw_pixel_size == 12) raw_pixel_size = 16;
     size_t max_pixels = (size_t)maxAllocSize / raw_pixel_size;
 
-    log_info("Maximums: [%ld x %ld x %ld], raw pixel size %lu bytes, "
+    log_info("Maximums: [%zu x %zu x %zu], raw pixel size %zu bytes, "
              "per-allocation limit %gMB.\n",
              maxWidth, maxHeight, isArray ? maxArraySize : maxDepth,
              raw_pixel_size, (maxAllocSize / (1024.0 * 1024.0)));
@@ -760,10 +760,10 @@ void get_max_sizes(
     if (image_type == CL_MEM_OBJECT_IMAGE1D)
     {
 
-        double M = maximum_sizes[0];
+        size_t M = maximum_sizes[0];
 
         // Store the size
-        sizes[(*numberOfSizes)][0] = (size_t)M;
+        sizes[(*numberOfSizes)][0] = M;
         sizes[(*numberOfSizes)][1] = 1;
         sizes[(*numberOfSizes)][2] = 1;
         ++(*numberOfSizes);
@@ -777,17 +777,17 @@ void get_max_sizes(
         {
 
             // Determine the size of the fixed dimension
-            double M = maximum_sizes[fixed_dim];
-            double A = max_pixels;
+            size_t M = maximum_sizes[fixed_dim];
+            size_t A = max_pixels;
 
             int x0_dim = !fixed_dim;
-            double x0 =
+            size_t x0 = static_cast<size_t>(
                 fmin(fmin(other_sizes[(other_size++) % num_other_sizes], A / M),
-                     maximum_sizes[x0_dim]);
+                     maximum_sizes[x0_dim]));
 
             // Store the size
-            sizes[(*numberOfSizes)][fixed_dim] = (size_t)M;
-            sizes[(*numberOfSizes)][x0_dim] = (size_t)x0;
+            sizes[(*numberOfSizes)][fixed_dim] = M;
+            sizes[(*numberOfSizes)][x0_dim] = x0;
             sizes[(*numberOfSizes)][2] = 1;
             ++(*numberOfSizes);
         }
@@ -802,16 +802,17 @@ void get_max_sizes(
         {
 
             // Determine the size of the fixed dimension
-            double M = maximum_sizes[fixed_dim];
-            double A = max_pixels;
+            size_t M = maximum_sizes[fixed_dim];
+            size_t A = max_pixels;
 
             // Find two other dimensions, x0 and x1
             int x0_dim = (fixed_dim == 0) ? 1 : 0;
             int x1_dim = (fixed_dim == 2) ? 1 : 2;
 
             // Choose two other sizes for these dimensions
-            double x0 = fmin(fmin(A / M, maximum_sizes[x0_dim]),
-                             other_sizes[(other_size++) % num_other_sizes]);
+            size_t x0 = static_cast<size_t>(
+                fmin(fmin(A / M, maximum_sizes[x0_dim]),
+                     other_sizes[(other_size++) % num_other_sizes]));
             // GPUs have certain restrictions on minimum width (row alignment)
             // of images which has given us issues testing small widths in this
             // test (say we set width to 3 for testing, and compute size based
@@ -820,8 +821,9 @@ void get_max_sizes(
             // width of 16 which doesnt fit in vram). For this purpose we are
             // not testing width < 16 for this test.
             if (x0_dim == 0 && x0 < 16) x0 = 16;
-            double x1 = fmin(fmin(A / M / x0, maximum_sizes[x1_dim]),
-                             other_sizes[(other_size++) % num_other_sizes]);
+            size_t x1 = static_cast<size_t>(
+                fmin(fmin(A / M / x0, maximum_sizes[x1_dim]),
+                     other_sizes[(other_size++) % num_other_sizes]));
 
             // Valid image sizes cannot be below 1. Due to the workaround for
             // the xo_dim where x0 is overidden to 16 there might not be enough
@@ -834,9 +836,9 @@ void get_max_sizes(
             assert(x0 > 0 && M > 0);
 
             // Store the size
-            sizes[(*numberOfSizes)][fixed_dim] = (size_t)M;
-            sizes[(*numberOfSizes)][x0_dim] = (size_t)x0;
-            sizes[(*numberOfSizes)][x1_dim] = (size_t)x1;
+            sizes[(*numberOfSizes)][fixed_dim] = M;
+            sizes[(*numberOfSizes)][x0_dim] = x0;
+            sizes[(*numberOfSizes)][x1_dim] = x1;
             ++(*numberOfSizes);
         }
     }
@@ -847,20 +849,20 @@ void get_max_sizes(
         switch (image_type)
         {
             case CL_MEM_OBJECT_IMAGE1D:
-                log_info(" size[%d] = [%ld] (%g MB image)\n", j, sizes[j][0],
+                log_info(" size[%d] = [%zu] (%g MB image)\n", j, sizes[j][0],
                          raw_pixel_size * sizes[j][0] * sizes[j][1]
                              * sizes[j][2] / (1024.0 * 1024.0));
                 break;
             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
             case CL_MEM_OBJECT_IMAGE2D:
-                log_info(" size[%d] = [%ld %ld] (%g MB image)\n", j,
+                log_info(" size[%d] = [%zu %zu] (%g MB image)\n", j,
                          sizes[j][0], sizes[j][1],
                          raw_pixel_size * sizes[j][0] * sizes[j][1]
                              * sizes[j][2] / (1024.0 * 1024.0));
                 break;
             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
             case CL_MEM_OBJECT_IMAGE3D:
-                log_info(" size[%d] = [%ld %ld %ld] (%g MB image)\n", j,
+                log_info(" size[%d] = [%zu %zu %zu] (%g MB image)\n", j,
                          sizes[j][0], sizes[j][1], sizes[j][2],
                          raw_pixel_size * sizes[j][0] * sizes[j][1]
                              * sizes[j][2] / (1024.0 * 1024.0));
@@ -1124,12 +1126,13 @@ void escape_inf_nan_values(char *data, size_t allocSize)
 char *generate_random_image_data(image_descriptor *imageInfo,
                                  BufferOwningPtr<char> &P, MTdata d)
 {
-    size_t allocSize = get_image_size(imageInfo);
+    size_t allocSize = static_cast<size_t>(get_image_size(imageInfo));
     size_t pixelRowBytes = imageInfo->width * get_pixel_size(imageInfo->format);
     size_t i;
 
     if (imageInfo->num_mip_levels > 1)
-        allocSize = compute_mipmapped_image_size(*imageInfo);
+        allocSize =
+            static_cast<size_t>(compute_mipmapped_image_size(*imageInfo));
 
 #if defined(__APPLE__)
     char *data = NULL;
@@ -1161,7 +1164,7 @@ char *generate_random_image_data(image_descriptor *imageInfo,
 
     if (data == NULL)
     {
-        log_error("ERROR: Unable to malloc %lu bytes for "
+        log_error("ERROR: Unable to malloc %zu bytes for "
                   "generate_random_image_data\n",
                   allocSize);
         return 0;
@@ -1678,24 +1681,26 @@ bool get_integer_coords_offset(float x, float y, float z, float xAddressOffset,
 
     // At this point, we're dealing with non-normalized coordinates.
 
-    outX = adFn(floorf(x), width);
+    outX = adFn(static_cast<int>(floorf(x)), width);
 
     // 1D and 2D arrays require special care for the index coordinate:
 
     switch (imageInfo->type)
     {
         case CL_MEM_OBJECT_IMAGE1D_ARRAY:
-            outY = calculate_array_index(y, (float)imageInfo->arraySize - 1.0f);
-            outZ = 0.0f; /* don't care! */
+            outY = static_cast<int>(
+                calculate_array_index(y, (float)imageInfo->arraySize - 1.0f));
+            outZ = 0; /* don't care! */
             break;
         case CL_MEM_OBJECT_IMAGE2D_ARRAY:
-            outY = adFn(floorf(y), height);
-            outZ = calculate_array_index(z, (float)imageInfo->arraySize - 1.0f);
+            outY = adFn(static_cast<int>(floorf(y)), height);
+            outZ = static_cast<int>(
+                calculate_array_index(z, (float)imageInfo->arraySize - 1.0f));
             break;
         default:
             // legacy path:
-            if (height != 0) outY = adFn(floorf(y), height);
-            if (depth != 0) outZ = adFn(floorf(z), depth);
+            if (height != 0) outY = adFn(static_cast<int>(floorf(y)), height);
+            if (depth != 0) outZ = adFn(static_cast<int>(floorf(z)), depth);
     }
 
     return !((int)refX == outX && (int)refY == outY && (int)refZ == outZ);
@@ -1766,7 +1771,7 @@ static float unnormalize_coordinate(const char *name, float coord, float offset,
     switch (addressing_mode)
     {
         case CL_ADDRESS_REPEAT:
-            ret = RepeatNormalizedAddressFn(coord, extent);
+            ret = RepeatNormalizedAddressFn(coord, static_cast<size_t>(extent));
 
             if (verbose)
             {
@@ -1790,7 +1795,8 @@ static float unnormalize_coordinate(const char *name, float coord, float offset,
             break;
 
         case CL_ADDRESS_MIRRORED_REPEAT:
-            ret = MirroredRepeatNormalizedAddressFn(coord, extent);
+            ret = MirroredRepeatNormalizedAddressFn(
+                coord, static_cast<size_t>(extent));
 
             if (verbose)
             {
@@ -1968,13 +1974,13 @@ FloatPixel sample_image_pixel_float_offset(
         // coordinates.  Note that the array cases again require special
         // care, per section 8.4 in the OpenCL 1.2 Specification.
 
-        ix = adFn(floorf(x), width_lod);
+        ix = adFn(static_cast<int>(floorf(x)), width_lod);
 
         switch (imageInfo->type)
         {
             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
-                iy =
-                    calculate_array_index(y, (float)(imageInfo->arraySize - 1));
+                iy = static_cast<int>(calculate_array_index(
+                    y, (float)(imageInfo->arraySize - 1)));
                 iz = 0;
                 if (verbose)
                 {
@@ -1982,18 +1988,18 @@ FloatPixel sample_image_pixel_float_offset(
                 }
                 break;
             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
-                iy = adFn(floorf(y), height_lod);
-                iz =
-                    calculate_array_index(z, (float)(imageInfo->arraySize - 1));
+                iy = adFn(static_cast<int>(floorf(y)), height_lod);
+                iz = static_cast<int>(calculate_array_index(
+                    z, (float)(imageInfo->arraySize - 1)));
                 if (verbose)
                 {
                     log_info("\tArray index %f evaluates to %d\n", z, iz);
                 }
                 break;
             default:
-                iy = adFn(floorf(y), height_lod);
+                iy = adFn(static_cast<int>(floorf(y)), height_lod);
                 if (depth_lod != 0)
-                    iz = adFn(floorf(z), depth_lod);
+                    iz = adFn(static_cast<int>(floorf(z)), depth_lod);
                 else
                     iz = 0;
         }
@@ -2047,16 +2053,16 @@ FloatPixel sample_image_pixel_float_offset(
                 height = 1;
             }
 
-            int x1 = adFn(floorf(x - 0.5f), width);
+            int x1 = adFn(static_cast<int>(floorf(x - 0.5f)), width);
             int y1 = 0;
-            int x2 = adFn(floorf(x - 0.5f) + 1, width);
+            int x2 = adFn(static_cast<int>(floorf(x - 0.5f) + 1), width);
             int y2 = 0;
             if ((imageInfo->type != CL_MEM_OBJECT_IMAGE1D)
                 && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
                 && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_BUFFER))
             {
-                y1 = adFn(floorf(y - 0.5f), height);
-                y2 = adFn(floorf(y - 0.5f) + 1, height);
+                y1 = adFn(static_cast<int>(floorf(y - 0.5f)), height);
+                y2 = adFn(static_cast<int>(floorf(y - 0.5f) + 1), height);
             }
             else
             {
@@ -2147,12 +2153,12 @@ FloatPixel sample_image_pixel_float_offset(
         else
         {
             // 3D linear filtering
-            int x1 = adFn(floorf(x - 0.5f), width_lod);
-            int y1 = adFn(floorf(y - 0.5f), height_lod);
-            int z1 = adFn(floorf(z - 0.5f), depth_lod);
-            int x2 = adFn(floorf(x - 0.5f) + 1, width_lod);
-            int y2 = adFn(floorf(y - 0.5f) + 1, height_lod);
-            int z2 = adFn(floorf(z - 0.5f) + 1, depth_lod);
+            int x1 = adFn(static_cast<int>(floorf(x - 0.5f)), width_lod);
+            int y1 = adFn(static_cast<int>(floorf(y - 0.5f)), height_lod);
+            int z1 = adFn(static_cast<int>(floorf(z - 0.5f)), depth_lod);
+            int x2 = adFn(static_cast<int>(floorf(x - 0.5f) + 1), width_lod);
+            int y2 = adFn(static_cast<int>(floorf(y - 0.5f) + 1), height_lod);
+            int z2 = adFn(static_cast<int>(floorf(z - 0.5f) + 1), depth_lod);
 
             if (verbose)
                 log_info("\tActual integer coords used (i = floor(x-.5)): "
@@ -2899,15 +2905,18 @@ void pack_image_pixel_error(const float *srcVector,
         case CL_UNSIGNED_INT8: {
             const cl_uchar *ptr = (const cl_uchar *)results;
             for (unsigned int i = 0; i < channelCount; i++)
-                errors[i] = (cl_int)ptr[i]
-                    - (cl_int)CONVERT_UINT(srcVector[i], 255.f, CL_UCHAR_MAX);
+                errors[i] = static_cast<float>(
+                    (cl_int)ptr[i]
+                    - (cl_int)CONVERT_UINT(srcVector[i], 255.f, CL_UCHAR_MAX));
             break;
         }
         case CL_UNSIGNED_INT16: {
             const cl_ushort *ptr = (const cl_ushort *)results;
             for (unsigned int i = 0; i < channelCount; i++)
-                errors[i] = (cl_int)ptr[i]
-                    - (cl_int)CONVERT_UINT(srcVector[i], 32767.f, CL_USHRT_MAX);
+                errors[i] = static_cast<float>(
+                    (cl_int)ptr[i]
+                    - (cl_int)CONVERT_UINT(srcVector[i], 32767.f,
+                                           CL_USHRT_MAX));
             break;
         }
         case CL_UNSIGNED_INT32: {
@@ -3228,7 +3237,7 @@ char *create_random_image_data(ExplicitType dataType,
     if (data == NULL)
     {
         log_error(
-            "ERROR: Unable to malloc %lu bytes for create_random_image_data\n",
+            "ERROR: Unable to malloc %zu bytes for create_random_image_data\n",
             allocSize);
         return NULL;
     }
@@ -3988,7 +3997,8 @@ bool is_image_format_required(cl_image_format format, cl_mem_flags flags,
 
 cl_uint compute_max_mip_levels(size_t width, size_t height, size_t depth)
 {
-    cl_uint retMaxMipLevels = 0, max_dim = 0;
+    cl_uint retMaxMipLevels = 0;
+    size_t max_dim = 0;
 
     max_dim = width;
     max_dim = height > max_dim ? height : max_dim;
diff --git a/test_common/harness/kernelHelpers.cpp b/test_common/harness/kernelHelpers.cpp
index 95b9555ed3..aaf0d689ca 100644
--- a/test_common/harness/kernelHelpers.cpp
+++ b/test_common/harness/kernelHelpers.cpp
@@ -579,7 +579,7 @@ static int create_single_kernel_helper_create_program_offline(
     if (error != CL_SUCCESS) return error;
 
     ifs.seekg(0, ifs.end);
-    int length = ifs.tellg();
+    size_t length = static_cast<size_t>(ifs.tellg());
     ifs.seekg(0, ifs.beg);
 
     // treat modifiedProgram as input for clCreateProgramWithBinary
@@ -1226,7 +1226,7 @@ int is_image_format_supported(cl_context context, cl_mem_flags flags,
     list = (cl_image_format *)malloc(count * sizeof(cl_image_format));
     if (NULL == list)
     {
-        log_error("Error: unable to allocate %ld byte buffer for image format "
+        log_error("Error: unable to allocate %zu byte buffer for image format "
                   "list at %s:%d (err = %d)\n",
                   count * sizeof(cl_image_format), __FILE__, __LINE__, err);
         return 0;
diff --git a/test_common/harness/os_helpers.cpp b/test_common/harness/os_helpers.cpp
index cd350cf831..daf2195851 100644
--- a/test_common/harness/os_helpers.cpp
+++ b/test_common/harness/os_helpers.cpp
@@ -404,7 +404,8 @@ std::string exe_path()
     for (;;)
     {
 
-        DWORD len = GetModuleFileNameA(NULL, &path.front(), path.size());
+        DWORD len = GetModuleFileNameA(NULL, &path.front(),
+                                       static_cast<DWORD>(path.size()));
 
         if (len == 0)
         {
diff --git a/test_common/harness/propertyHelpers.cpp b/test_common/harness/propertyHelpers.cpp
index 3157ca8091..e368f9b61b 100644
--- a/test_common/harness/propertyHelpers.cpp
+++ b/test_common/harness/propertyHelpers.cpp
@@ -97,15 +97,16 @@ int compareProperties(const std::vector<cl_properties>& queried,
 
             if (!found)
             {
-                log_error("ERROR: expected property 0x%x not found!\n",
+                log_error("ERROR: expected property 0x%llx not found!\n",
                           check_prop);
                 return TEST_FAIL;
             }
             else if (check_value != queried_value)
             {
-                log_error("ERROR: mis-matched value for property 0x%x: wanted "
-                          "0x%x, got 0x%x\n",
-                          check_prop, check_value, queried_value);
+                log_error(
+                    "ERROR: mis-matched value for property 0x%llx: wanted "
+                    "0x%llx, got 0x%llx\n",
+                    check_prop, check_value, queried_value);
                 return TEST_FAIL;
             }
         }
@@ -113,7 +114,7 @@ int compareProperties(const std::vector<cl_properties>& queried,
         if (queried.size() > check.size())
         {
             log_error("ERROR: all properties found but there are extra "
-                      "properties: expected %d, got %d.\n",
+                      "properties: expected %zu, got %zu.\n",
                       check.size(), queried.size());
             return TEST_FAIL;
         }
diff --git a/test_common/harness/rounding_mode.cpp b/test_common/harness/rounding_mode.cpp
index 681ccdd8ad..1f531478cf 100644
--- a/test_common/harness/rounding_mode.cpp
+++ b/test_common/harness/rounding_mode.cpp
@@ -48,7 +48,7 @@ RoundingMode set_round(RoundingMode r, Type outType)
     const int *p = int_rounds;
     if (outType == kfloat || outType == kdouble) p = flt_rounds;
 
-    int fpscr = 0;
+    int64_t fpscr = 0;
     RoundingMode oldRound = get_round();
 
     _FPU_GETCW(fpscr);
@@ -59,7 +59,7 @@ RoundingMode set_round(RoundingMode r, Type outType)
 
 RoundingMode get_round(void)
 {
-    int fpscr;
+    int64_t fpscr;
     int oldRound;
 
     _FPU_GETCW(fpscr);
@@ -203,13 +203,13 @@ void *FlushToZero(void)
 #if defined(__APPLE__) || defined(__linux__) || defined(_WIN32)
 #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)
     union {
-        int i;
+        unsigned int i;
         void *p;
     } u = { _mm_getcsr() };
     _mm_setcsr(u.i | 0x8040);
     return u.p;
 #elif defined(__arm__) || defined(__aarch64__)
-    int fpscr;
+    int64_t fpscr;
     _FPU_GETCW(fpscr);
     _FPU_SETCW(fpscr | FPSCR_FZ);
     return NULL;
@@ -239,7 +239,7 @@ void UnFlushToZero(void *p)
     } u = { p };
     _mm_setcsr(u.i);
 #elif defined(__arm__) || defined(__aarch64__)
-    int fpscr;
+    int64_t fpscr;
     _FPU_GETCW(fpscr);
     _FPU_SETCW(fpscr & ~FPSCR_FZ);
 #elif defined(__PPC__)
diff --git a/test_common/harness/rounding_mode.h b/test_common/harness/rounding_mode.h
index 064a3a63a8..6f52f0a00b 100644
--- a/test_common/harness/rounding_mode.h
+++ b/test_common/harness/rounding_mode.h
@@ -16,8 +16,6 @@
 #ifndef __ROUNDING_MODE_H__
 #define __ROUNDING_MODE_H__
 
-#pragma STDC FENV_ACCESS ON
-
 #include "compat.h"
 
 #if (defined(_WIN32) && defined(_MSC_VER))
diff --git a/test_common/harness/threadTesting.cpp b/test_common/harness/threadTesting.cpp
index 875ee59b92..e69de29bb2 100644
--- a/test_common/harness/threadTesting.cpp
+++ b/test_common/harness/threadTesting.cpp
@@ -1,98 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "compat.h"
-#include "threadTesting.h"
-#include "errorHelpers.h"
-#include <stdio.h>
-#include <string.h>
-
-#if !defined(_WIN32)
-#include <pthread.h>
-#endif
-
-#if 0 // Disabed for now
-
-typedef struct
-{
-    basefn            mFunction;
-    cl_device_id    mDevice;
-    cl_context        mContext;
-    int                mNumElements;
-} TestFnArgs;
-
-////////////////////////////////////////////////////////////////////////////////
-// Thread-based testing. Spawns a new thread to run the given test function,
-// then waits for it to complete. The entire idea is that, if the thread crashes,
-// we can catch it and report it as a failure instead of crashing the entire suite
-////////////////////////////////////////////////////////////////////////////////
-
-void *test_thread_wrapper( void *data )
-{
-    TestFnArgs *args;
-    int retVal;
-    cl_context context;
-
-    args = (TestFnArgs *)data;
-
-    /* Create a new context to use (contexts can't cross threads) */
-    context = clCreateContext(NULL, args->mDeviceGroup);
-    if( context == NULL )
-    {
-        log_error("clCreateContext failed for new thread\n");
-        return (void *)(-1);
-    }
-
-    /* Call function */
-    retVal = args->mFunction( args->mDeviceGroup, args->mDevice, context, args->mNumElements );
-
-    clReleaseContext( context );
-
-    return (void *)retVal;
-}
-
-int test_threaded_function( basefn fnToTest, cl_device_id device, cl_context context, cl_command_queue queue, int numElements )
-{
-    int error;
-    pthread_t threadHdl;
-    void *retVal;
-    TestFnArgs args;
-
-
-    args.mFunction = fnToTest;
-    args.mDeviceGroup = deviceGroup;
-    args.mDevice = device;
-    args.mContext = context;
-    args.mNumElements = numElements;
-
-
-    error = pthread_create( &threadHdl, NULL, test_thread_wrapper, (void *)&args );
-    if( error != 0 )
-    {
-        log_error( "ERROR: Unable to create thread for testing!\n" );
-        return -1;
-    }
-
-    /* Thread has been started, now just wait for it to complete (or crash) */
-    error = pthread_join( threadHdl, &retVal );
-    if( error != 0 )
-    {
-        log_error( "ERROR: Unable to join testing thread!\n" );
-        return -1;
-    }
-
-    return (int)((intptr_t)retVal);
-}
-#endif
diff --git a/test_common/harness/threadTesting.h b/test_common/harness/threadTesting.h
index 765eabcc61..2f3c1873c7 100644
--- a/test_common/harness/threadTesting.h
+++ b/test_common/harness/threadTesting.h
@@ -24,8 +24,5 @@
 
 typedef int (*basefn)(cl_device_id deviceID, cl_context context,
                       cl_command_queue queue, int num_elements);
-extern int test_threaded_function(basefn fnToTest, cl_device_id device,
-                                  cl_context context, cl_command_queue queue,
-                                  int numElements);
 
-#endif // _threadTesting_h
+#endif // _threadTesting_h
\ No newline at end of file
diff --git a/test_conformance/gles/CMakeLists.txt b/test_conformance/gles/CMakeLists.txt
index c76fe51248..4f4ba53216 100644
--- a/test_conformance/gles/CMakeLists.txt
+++ b/test_conformance/gles/CMakeLists.txt
@@ -18,3 +18,11 @@ set (${MODULE_NAME}_SOURCES
 list(APPEND CLConform_LIBRARIES EGL GLESv2)
 
 include(../CMakeCommon.txt)
+
+if(DEFINED USE_GLES3)
+    target_compile_definitions(${${MODULE_NAME}_OUT} PRIVATE GLES3)
+endif()
+if(MSVC)
+    # Don't warn about using the portable "strdup" function.
+    target_compile_definitions(${${MODULE_NAME}_OUT} PRIVATE _CRT_NONSTDC_NO_DEPRECATE)
+endif()
\ No newline at end of file
diff --git a/test_conformance/gles/setup_egl.cpp b/test_conformance/gles/setup_egl.cpp
index fe0f8ca36f..95a12a667f 100644
--- a/test_conformance/gles/setup_egl.cpp
+++ b/test_conformance/gles/setup_egl.cpp
@@ -117,7 +117,8 @@ class EGLGLEnvironment : public GLEnvironment
                 _platform, "clGetGLContextInfoKHR");
         if (GetGLContextInfo == NULL)
         {
-            print_error(status, "clGetGLContextInfoKHR failed");
+            log_error("ERROR: clGetGLContextInfoKHR failed! (%s:%d)\n",
+                      __FILE__, __LINE__);
             return NULL;
         }
 
@@ -128,7 +129,7 @@ class EGLGLEnvironment : public GLEnvironment
             return NULL;
         }
         dev_size /= sizeof(cl_device_id);
-        log_info("GL _context supports %d compute devices\n", dev_size);
+        log_info("GL _context supports %zu compute devices\n", dev_size);
 
         status =
             GetGLContextInfo(properties, CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR,
diff --git a/test_conformance/gles/test_fence_sync.cpp b/test_conformance/gles/test_fence_sync.cpp
index 0af91a4622..968d969522 100644
--- a/test_conformance/gles/test_fence_sync.cpp
+++ b/test_conformance/gles/test_fence_sync.cpp
@@ -570,10 +570,12 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_
                         {
                             if( p[ t ] == 0 )
                             {
-                                log_error( "RUN: %ld to %ld (%d,%d to %d,%d) 0x%08x\n", a, t - 1,
-                                          (int)( a % framebufferSize ), (int)( a / framebufferSize ),
-                                          (int)( ( t - 1 ) % framebufferSize ), (int)( ( t - 1 ) / framebufferSize ),
-                                          p[ a ] );
+                                log_error(
+                                    "RUN: %zu to %zu (%d,%d to %d,%d) 0x%08x\n",
+                                    a, t - 1, (int)(a % framebufferSize),
+                                    (int)(a / framebufferSize),
+                                    (int)((t - 1) % framebufferSize),
+                                    (int)((t - 1) / framebufferSize), p[a]);
                                 a = t;
                             }
                         }
diff --git a/test_conformance/gles/test_images_2D.cpp b/test_conformance/gles/test_images_2D.cpp
index c1a17fc8d8..f6554023f2 100644
--- a/test_conformance/gles/test_images_2D.cpp
+++ b/test_conformance/gles/test_images_2D.cpp
@@ -369,7 +369,9 @@ int test_images_read_cube( cl_device_id device, cl_context context, cl_command_q
 }
 
 
+#ifdef __APPLE__
 #pragma mark -------------------- Write tests -------------------------
+#endif
 
 
 int test_cl_image_write( cl_context context, cl_command_queue queue, cl_mem clImage,
diff --git a/test_conformance/gles/test_renderbuffer.cpp b/test_conformance/gles/test_renderbuffer.cpp
index 20127aca8d..0f6d289b9c 100644
--- a/test_conformance/gles/test_renderbuffer.cpp
+++ b/test_conformance/gles/test_renderbuffer.cpp
@@ -197,7 +197,9 @@ int test_renderbuffer_read( cl_device_id device, cl_context context, cl_command_
 }
 
 
+#ifdef __APPLE__
 #pragma mark -------------------- Write tests -------------------------
+#endif
 
 int test_attach_renderbuffer_write_to_image( cl_context context, cl_command_queue queue, GLenum glTarget, GLuint glRenderbuffer,
                      size_t imageWidth, size_t imageHeight, cl_image_format *outFormat, ExplicitType *outType, MTdata d, void **outSourceBuffer )

From de49d59c8dfad1171d7dd7c0df929ae3a68aea1a Mon Sep 17 00:00:00 2001
From: Pierre Moreau <pierremoreau@users.noreply.github.com>
Date: Tue, 18 May 2021 19:12:55 +0200
Subject: [PATCH 095/158] Allocations fixes (#1245)

* allocations: Run buffer non-blocking even without images

Testing buffer non-blocking should not be dependent on whether images
are supported by a device or not.

* allocations: Fix typos
---
 test_conformance/allocations/allocation_fill.cpp      | 6 ++++--
 test_conformance/allocations/allocation_functions.cpp | 4 ++--
 test_conformance/allocations/main.cpp                 | 6 ++++--
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/test_conformance/allocations/allocation_fill.cpp b/test_conformance/allocations/allocation_fill.cpp
index a75589427b..b4ea379864 100644
--- a/test_conformance/allocations/allocation_fill.cpp
+++ b/test_conformance/allocations/allocation_fill.cpp
@@ -200,8 +200,10 @@ int fill_image_with_data(cl_context context, cl_device_id device_id, cl_command_
       result = clFinish(*queue);
       if (result != SUCCEEDED)
       {
-        print_error(error, "clFinish failed after successful enquing filling buffer with data.");
-        return result;
+          print_error(error,
+                      "clFinish failed after successful enqueuing filling "
+                      "buffer with data.");
+          return result;
       }
     } else {
       error = clEnqueueWriteImage(*queue, mem, CL_FALSE, origin, region, 0, 0, data, 0, NULL, &event);
diff --git a/test_conformance/allocations/allocation_functions.cpp b/test_conformance/allocations/allocation_functions.cpp
index 7182c7271e..827ee1042d 100644
--- a/test_conformance/allocations/allocation_functions.cpp
+++ b/test_conformance/allocations/allocation_functions.cpp
@@ -37,8 +37,8 @@ int find_good_image_size(cl_device_id device_id, size_t size_to_allocate, size_t
   }
 
   if (size_to_allocate == 0) {
-    log_error("Trying to allcoate a zero sized image.\n");
-    return FAILED_ABORT;
+      log_error("Trying to allocate a zero sized image.\n");
+      return FAILED_ABORT;
   }
 
   error = clGetDeviceInfo( device_id, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof( max_width ), &max_width, NULL );
diff --git a/test_conformance/allocations/main.cpp b/test_conformance/allocations/main.cpp
index 0dec4c6dd7..43e81277ed 100644
--- a/test_conformance/allocations/main.cpp
+++ b/test_conformance/allocations/main.cpp
@@ -112,6 +112,8 @@ int doTest( cl_device_id device, cl_context context, cl_command_queue queue, All
     int number_of_mems_used;
     cl_ulong max_individual_allocation_size = g_max_individual_allocation_size;
     cl_ulong global_mem_size = g_global_mem_size ;
+    const bool allocate_image =
+        (alloc_type != BUFFER) && (alloc_type != BUFFER_NON_BLOCKING);
 
     static const char* alloc_description[] = {
         "buffer(s)",
@@ -123,7 +125,7 @@ int doTest( cl_device_id device, cl_context context, cl_command_queue queue, All
     };
 
     // Skip image tests if we don't support images on the device
-    if( alloc_type > BUFFER && checkForImageSupport( device ) )
+    if (allocate_image && checkForImageSupport(device))
     {
         log_info( "Can not test image allocation because device does not support images.\n" );
         return 0;
@@ -132,7 +134,7 @@ int doTest( cl_device_id device, cl_context context, cl_command_queue queue, All
     // This section was added in order to fix a bug in the test
     // If CL_DEVICE_MAX_MEM_ALLOC_SIZE is much grater than CL_DEVICE_IMAGE2D_MAX_WIDTH * CL_DEVICE_IMAGE2D_MAX_HEIGHT
     // The test will fail in image allocations as the size requested for the allocation will be much grater than the maximum size allowed for image
-    if( ( alloc_type != BUFFER ) && ( alloc_type != BUFFER_NON_BLOCKING ) )
+    if (allocate_image)
     {
         size_t max_width, max_height;
 

From 01aa55029d49a7c788e9edd97ff686816ff84267 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Fri, 21 May 2021 10:06:13 +0100
Subject: [PATCH 096/158] Update warning options (#1252)

Remove the workaround for #783; it was fixed by #1237.

Remove the workaround for overflow now that #699 has been merged.

Disable errors from -Wimplicit-const-int-float-conversion; the issue is
covered by #1250.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 CMakeLists.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5b1f48fd86..8d947ed1c8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,10 +106,9 @@ if(CMAKE_COMPILER_IS_GNUCC OR "${CMAKE_CXX_COMPILER_ID}" MATCHES "(Apple)?Clang"
     add_cxx_flag_if_supported(-Wno-format)
     add_cxx_flag_if_supported(-Werror)
     add_cxx_flag_if_supported(-Wno-error=cpp) # Allow #warning directive
-    add_cxx_flag_if_supported(-Wno-error=absolute-value) # Issue 783
     add_cxx_flag_if_supported(-Wno-error=unknown-pragmas) # Issue #785
     add_cxx_flag_if_supported(-Wno-error=asm-operand-widths) # Issue #784
-    add_cxx_flag_if_supported(-Wno-error=overflow) # Fixed by #699
+    add_cxx_flag_if_supported(-Wno-error=implicit-const-int-float-conversion) # Issue #1250
 
     # -msse -mfpmath=sse to force gcc to use sse for float math,
     # avoiding excess precision problems that cause tests like int2float

From ce1687a408686d38e2629a4426ef7c38e10f0e23 Mon Sep 17 00:00:00 2001
From: James Price <jrprice@google.com>
Date: Fri, 21 May 2021 05:07:12 -0400
Subject: [PATCH 097/158] Add missing cstdint include (#1259)

---
 test_common/harness/fpcontrol.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test_common/harness/fpcontrol.h b/test_common/harness/fpcontrol.h
index 2add9baf97..222aa2c40c 100644
--- a/test_common/harness/fpcontrol.h
+++ b/test_common/harness/fpcontrol.h
@@ -16,6 +16,8 @@
 #ifndef _fpcontrol_h
 #define _fpcontrol_h
 
+#include <cstdint>
+
 // In order to get tests for correctly rounded operations (e.g. multiply) to
 // work properly we need to be able to set the reference hardware to FTZ mode if
 // the device hardware is running in that mode.  We have explored all other

From ba9312e4a2e2431a716150a3df3491834076d046 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Fri, 21 May 2021 10:07:54 +0100
Subject: [PATCH 098/158] Fix ODR violations in math_brute_force (#1255)

A program having a type (such as ThreadInfo) defined differently in
multiple translation units exhibits undefined behaviour.

This commit fixes such issues in the math_brute_force component by
ensuring most types are local to their translation unit with the help of
anonymous namespaces. Later refactoring will be able to extract common
definitions to a single place.

This patch also removes unnecessary static and typedef keywords.
Otherwise, code is only moved around with no change.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/binary_double.cpp        | 396 ++++++++---------
 .../math_brute_force/binary_float.cpp         | 398 ++++++++---------
 .../math_brute_force/binary_i_double.cpp      | 399 ++++++++---------
 .../math_brute_force/binary_i_float.cpp       | 401 ++++++++---------
 .../binary_operator_double.cpp                | 392 ++++++++---------
 .../binary_operator_float.cpp                 | 396 ++++++++---------
 .../binary_two_results_i_double.cpp           |  20 +-
 .../binary_two_results_i_float.cpp            |  20 +-
 .../math_brute_force/function_list.cpp        |   1 +
 .../math_brute_force/i_unary_double.cpp       |  14 +-
 .../math_brute_force/i_unary_float.cpp        |  14 +-
 .../math_brute_force/macro_binary_double.cpp  | 355 +++++++--------
 .../math_brute_force/macro_binary_float.cpp   | 357 ++++++++--------
 .../math_brute_force/macro_unary_double.cpp   | 323 +++++++-------
 .../math_brute_force/macro_unary_float.cpp    | 325 +++++++-------
 .../math_brute_force/mad_double.cpp           |  14 +-
 .../math_brute_force/mad_float.cpp            |  14 +-
 .../math_brute_force/ternary_double.cpp       |  18 +-
 .../math_brute_force/ternary_float.cpp        |  18 +-
 .../math_brute_force/unary_double.cpp         | 352 +++++++--------
 .../math_brute_force/unary_float.cpp          | 404 +++++++++---------
 .../unary_two_results_double.cpp              |  14 +-
 .../unary_two_results_float.cpp               |  14 +-
 .../unary_two_results_i_double.cpp            |  16 +-
 .../unary_two_results_i_float.cpp             |  16 +-
 .../math_brute_force/unary_u_double.cpp       |  16 +-
 .../math_brute_force/unary_u_float.cpp        |  14 +-
 27 files changed, 2400 insertions(+), 2321 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
index 4baa499180..9c6b59b4e5 100644
--- a/test_conformance/math_brute_force/binary_double.cpp
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -20,10 +20,12 @@
 
 #include <cstring>
 
+namespace {
+
 const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -109,7 +111,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
@@ -117,9 +119,9 @@ typedef struct BuildKernelInfo
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -128,7 +130,7 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
 {
     cl_mem inBuf; // input buffer for the thread
     cl_mem inBuf2; // input buffer for the thread
@@ -140,9 +142,9 @@ typedef struct ThreadInfo
                            // to 0.
     MTdata d;
     cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+};
 
-typedef struct TestInfo
+struct TestInfo
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
@@ -164,10 +166,10 @@ typedef struct TestInfo
     int isNextafter;
     bool relaxedMode; // True if test is running in relaxed mode, false
                       // otherwise.
-} TestInfo;
+};
 
 // A table of more difficult cases to get right
-static const double specialValues[] = {
+const double specialValues[] = {
     -NAN,
     -INFINITY,
     -DBL_MAX,
@@ -277,195 +279,10 @@ static const double specialValues[] = {
     +0.0,
 };
 
-static size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
 
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-
-int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    double maxErrorVal2 = 0.0;
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->double_ulps;
-    test_info.ftz = f->ftz || gForceFTZ;
-
-    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
-    test_info.skipNanInf = 0;
-    test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -825,3 +642,188 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 exit:
     return error;
 }
+
+} // anonymous namespace
+
+int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
+    test_info.skipNanInf = 0;
+    test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+exit:
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_float.cpp b/test_conformance/math_brute_force/binary_float.cpp
index 32caafa353..9c7081dcff 100644
--- a/test_conformance/math_brute_force/binary_float.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -20,10 +20,12 @@
 
 #include <cstring>
 
+namespace {
+
 const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "__kernel void math_kernel",
                         sizeNames[vectorSize],
@@ -107,7 +109,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
@@ -115,9 +117,9 @@ typedef struct BuildKernelInfo
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -126,7 +128,7 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
 {
     cl_mem inBuf; // input buffer for the thread
     cl_mem inBuf2; // input buffer for the thread
@@ -138,9 +140,9 @@ typedef struct ThreadInfo
                            // to 0.
     MTdata d;
     cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+};
 
-typedef struct TestInfo
+struct TestInfo
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
@@ -162,10 +164,10 @@ typedef struct TestInfo
     int isNextafter;
     bool relaxedMode; // True if test is running in relaxed mode, false
                       // otherwise.
-} TestInfo;
+};
 
 // A table of more difficult cases to get right
-static const float specialValues[] = {
+const float specialValues[] = {
     -NAN,
     -INFINITY,
     -FLT_MAX,
@@ -267,196 +269,10 @@ static const float specialValues[] = {
     +0.0f,
 };
 
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
 
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-
-int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    double maxErrorVal2 = 0.0;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    test_info.relaxedMode = relaxedMode;
-    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
-    test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
-    test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -986,3 +802,189 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     if (overflow) free(overflow);
     return error;
 }
+
+} // anonymous namespace
+
+// Tests a float function of two float inputs (per the name and the cl_float
+// sizing below): sets up per-thread sub-buffers and queues, builds the kernels
+// and runs Test via the thread pool. Returns 0 on success, else the error code.
+int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error = CL_SUCCESS; // assigned on every path; initialized defensively
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    // Init test_info (zeroed first so the cleanup at exit sees null handles)
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // the multiplication above wrapped; fall back to a single job
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.relaxedMode = relaxedMode;
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
+    test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
+    test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zu, %zu}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels: one BuildKernelFn job per vector size index
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors (done before the error check below)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        if (error) goto exit;
+
+        vlog("%s", gWimpyMode ? "Wimp pass" : "passed");
+
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+exit:
+    // Release programs and per-thread kernels, then per-thread resources
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index 69e620aaa5..2fcc8c10b7 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -21,8 +21,10 @@
 #include <climits>
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -108,7 +110,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
@@ -116,9 +118,9 @@ typedef struct BuildKernelInfo
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -127,7 +129,7 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
 {
     cl_mem inBuf; // input buffer for the thread
     cl_mem inBuf2; // input buffer for the thread
@@ -139,9 +141,9 @@ typedef struct ThreadInfo
                            // to 0.
     MTdata d;
     cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+};
 
-typedef struct TestInfo
+struct TestInfo
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
@@ -159,10 +161,10 @@ typedef struct TestInfo
     int ftz; // non-zero if running in flush to zero mode
 
     // no special values
-} TestInfo;
+};
 
 // A table of more difficult cases to get right
-static const double specialValues[] = {
+const double specialValues[] = {
     -NAN,
     -INFINITY,
     -DBL_MAX,
@@ -272,201 +274,18 @@ static const double specialValues[] = {
     +0.0,
 };
 
-static size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
 
-static const int specialValuesInt[] = {
+const int specialValuesInt[] = {
     0,       1,  2,  3,  1022,  1023,  1024,   INT_MIN,
     INT_MAX, -1, -2, -3, -1022, -1023, -11024, -INT_MAX,
 };
-static constexpr size_t specialValuesIntCount =
-    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
-
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-
-int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    cl_int maxErrorVal2 = 0;
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->double_ulps;
-    test_info.ftz = f->ftz || gForceFTZ;
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        cl_buffer_region region2 = { i * test_info.subBufferSize
-                                         * sizeof(cl_int),
-                                     test_info.subBufferSize * sizeof(cl_int) };
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
 
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-
-        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
+constexpr size_t specialValuesIntCount =
+    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
 
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -744,3 +563,187 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 exit:
     return error;
 }
+
+} // anonymous namespace
+
+// Tests a double function of (double, int) inputs (per the name; inBuf is
+// sized in cl_double and inBuf2 in cl_int below): sets up per-thread buffers
+// and queues, builds the kernels and runs Test. Returns 0 on success.
+int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error = CL_SUCCESS; // assigned on every path; initialized defensively
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    cl_int maxErrorVal2 = 0;
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    // Init test_info (zeroed first so the cleanup at exit sees null handles)
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // the multiplication above wrapped; fall back to a single job
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        cl_buffer_region region2 = { i * test_info.subBufferSize
+                                         * sizeof(cl_int),
+                                     test_info.subBufferSize * sizeof(cl_int) };
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n",
+                       region2.origin, region2.size);
+            goto exit;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zu, %zu}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels: one BuildKernelFn job per vector size index
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors (done before the error check below)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        if (error) goto exit;
+
+        vlog("%s", gWimpyMode ? "Wimp pass" : "passed");
+
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+exit:
+    // Release programs and per-thread kernels, then per-thread resources
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
index e65a9aaffc..e1538e3c42 100644
--- a/test_conformance/math_brute_force/binary_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -21,8 +21,10 @@
 #include <climits>
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "__kernel void math_kernel",
                         sizeNames[vectorSize],
@@ -106,7 +108,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
@@ -114,9 +116,9 @@ typedef struct BuildKernelInfo
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -125,7 +127,7 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
 {
     cl_mem inBuf; // input buffer for the thread
     cl_mem inBuf2; // input buffer for the thread
@@ -137,9 +139,9 @@ typedef struct ThreadInfo
                            // to 0.
     MTdata d;
     cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+};
 
-typedef struct TestInfo
+struct TestInfo
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
@@ -157,10 +159,10 @@ typedef struct TestInfo
     int ftz; // non-zero if running in flush to zero mode
 
     // no special values
-} TestInfo;
+};
 
 // A table of more difficult cases to get right
-static const float specialValues[] = {
+const float specialValues[] = {
     -NAN,
     -INFINITY,
     -FLT_MAX,
@@ -262,204 +264,20 @@ static const float specialValues[] = {
     +0.0f,
 };
 
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
 
-static const int specialValuesInt[] = {
+const int specialValuesInt[] = {
     0,           1,           2,           3,          126,        127,
     128,         0x02000001,  0x04000001,  1465264071, 1488522147, -1,
     -2,          -3,          -126,        -127,       -128,       -0x02000001,
     -0x04000001, -1465264071, -1488522147,
 };
-static size_t specialValuesIntCount =
-    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
-
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-
-int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    cl_int maxErrorVal2 = 0;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        cl_buffer_region region2 = { i * test_info.subBufferSize
-                                         * sizeof(cl_int),
-                                     test_info.subBufferSize * sizeof(cl_int) };
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
 
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-
-        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
+constexpr size_t specialValuesIntCount =
+    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
 
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -738,3 +556,188 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 exit:
     return error;
 }
+
+} // anonymous namespace
+
+// Test driver for a function fed from a cl_float buffer and a cl_int buffer.
+// Creates per-thread sub-buffers, queues and kernels, runs Test() through the
+// thread pool, logs the largest error, and returns 0 or the failing cl_int.
+int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    cl_int maxErrorVal2 = 0;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    // Init test_info (zeroed first so the cleanup path sees NULL handles)
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // the multiplication overflowed; fall back to a single job
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        test_info.k[i] =
+            (cl_kernel *)calloc(test_info.threadCount, sizeof(cl_kernel));
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+    }
+    test_info.tinfo =
+        (ThreadInfo *)calloc(test_info.threadCount, sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        cl_buffer_region region2 = { i * test_info.subBufferSize
+                                         * sizeof(cl_int),
+                                     test_info.subBufferSize * sizeof(cl_int) };
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n",
+                       region2.origin, region2.size);
+            goto exit;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zu, %zu}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels: one build job per vector size
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors (done even when a job failed)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+exit:
+    // Release; entries never created are still zero from the zero-filling
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index 21e76c8553..605a31444e 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -20,9 +20,11 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *operator_symbol, int vectorSize,
-                       cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                       bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *operator_symbol, int vectorSize,
+                cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -108,7 +110,7 @@ static int BuildKernel(const char *operator_symbol, int vectorSize,
                        relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
@@ -116,9 +118,9 @@ typedef struct BuildKernelInfo
     cl_program *programs;
     const char *operator_symbol;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -127,7 +129,7 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
 {
     cl_mem inBuf; // input buffer for the thread
     cl_mem inBuf2; // input buffer for the thread
@@ -139,9 +141,9 @@ typedef struct ThreadInfo
                            // to 0.
     MTdata d;
     cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+};
 
-typedef struct TestInfo
+struct TestInfo
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
@@ -161,10 +163,10 @@ typedef struct TestInfo
                       // otherwise.
 
     // no special fields
-} TestInfo;
+};
 
 // A table of more difficult cases to get right
-static const double specialValues[] = {
+const double specialValues[] = {
     -NAN,
     -INFINITY,
     -DBL_MAX,
@@ -274,192 +276,10 @@ static const double specialValues[] = {
     +0.0,
 };
 
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
 
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-
-int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
-                                           bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    double maxErrorVal2 = 0.0;
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->double_ulps;
-    test_info.ftz = f->ftz || gForceFTZ;
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -793,3 +613,185 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 exit:
     return error;
 }
+
+} // anonymous namespace
+
+// Test driver for a binary operator fed from two cl_double input buffers.
+// Creates per-thread sub-buffers, queues and kernels, runs Test() through the
+// thread pool, logs the largest error, and returns 0 or the failing cl_int.
+int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
+                                           bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    // Init test_info (zeroed first so the cleanup path sees NULL handles)
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // the multiplication overflowed; fall back to a single job
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        test_info.k[i] =
+            (cl_kernel *)calloc(test_info.threadCount, sizeof(cl_kernel));
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+    }
+    test_info.tinfo =
+        (ThreadInfo *)calloc(test_info.threadCount, sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zu, %zu}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels: one build job per vector size
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors (done even when a job failed)
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+exit:
+    // Release; entries never created are still zero from the zero-filling
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
index ccaef604be..8448af5458 100644
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -20,9 +20,11 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *operator_symbol, int vectorSize,
-                       cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                       bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *operator_symbol, int vectorSize,
+                cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "__kernel void math_kernel",
                         sizeNames[vectorSize],
@@ -106,7 +108,7 @@ static int BuildKernel(const char *operator_symbol, int vectorSize,
                        relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
@@ -114,9 +116,9 @@ typedef struct BuildKernelInfo
     cl_program *programs;
     const char *operator_symbol;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -125,7 +127,7 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
 {
     cl_mem inBuf; // input buffer for the thread
     cl_mem inBuf2; // input buffer for the thread
@@ -137,9 +139,9 @@ typedef struct ThreadInfo
                            // to 0.
     MTdata d;
     cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+};
 
-typedef struct TestInfo
+struct TestInfo
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
@@ -159,10 +161,10 @@ typedef struct TestInfo
                       // otherwise.
 
     // no special fields
-} TestInfo;
+};
 
 // A table of more difficult cases to get right
-static const float specialValues[] = {
+const float specialValues[] = {
     -NAN,
     -INFINITY,
     -FLT_MAX,
@@ -264,194 +266,10 @@ static const float specialValues[] = {
     +0.0f,
 };
 
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
 
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-
-int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
-                                        bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    double maxErrorVal2 = 0.0;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    test_info.relaxedMode = relaxedMode;
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -923,3 +741,187 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     if (overflow) free(overflow);
     return error;
 }
+
+} // anonymous namespace
+
+int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
+                                        bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.relaxedMode = relaxedMode;
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+exit:
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_two_results_i_double.cpp b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
index 14f4109239..43dc1d304e 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
@@ -21,8 +21,10 @@
 #include <climits>
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -115,16 +117,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -132,7 +134,7 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
                        info->programs + i, info->relaxedMode);
 }
 
-typedef struct ComputeReferenceInfoD_
+struct ComputeReferenceInfoD
 {
     const double *x;
     const double *y;
@@ -141,9 +143,9 @@ typedef struct ComputeReferenceInfoD_
     long double (*f_ffpI)(long double, long double, int *);
     cl_uint lim;
     cl_uint count;
-} ComputeReferenceInfoD;
+};
 
-static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
+cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
 {
     ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
     cl_uint lim = cri->lim;
@@ -165,6 +167,8 @@ static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
     return CL_SUCCESS;
 }
 
+} // anonymous namespace
+
 int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
diff --git a/test_conformance/math_brute_force/binary_two_results_i_float.cpp b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
index 5ef44b6e0c..83ceeaabf6 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
@@ -21,8 +21,10 @@
 #include <climits>
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "__kernel void math_kernel",
                         sizeNames[vectorSize],
@@ -113,16 +115,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -130,7 +132,7 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
                        info->programs + i, info->relaxedMode);
 }
 
-typedef struct ComputeReferenceInfoF_
+struct ComputeReferenceInfoF
 {
     const float *x;
     const float *y;
@@ -139,9 +141,9 @@ typedef struct ComputeReferenceInfoF_
     double (*f_ffpI)(double, double, int *);
     cl_uint lim;
     cl_uint count;
-} ComputeReferenceInfoF;
+};
 
-static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
+cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
 {
     ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
     cl_uint lim = cri->lim;
@@ -161,6 +163,8 @@ static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
     return CL_SUCCESS;
 }
 
+} // anonymous namespace
+
 int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp
index 3edbb4854d..917362852c 100644
--- a/test_conformance/math_brute_force/function_list.cpp
+++ b/test_conformance/math_brute_force/function_list.cpp
@@ -53,6 +53,7 @@
         STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \
             _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
     }
+
 #define unaryF NULL
 #define i_unaryF NULL
 #define unaryF_u NULL
diff --git a/test_conformance/math_brute_force/i_unary_double.cpp b/test_conformance/math_brute_force/i_unary_double.cpp
index 4383fa8b2f..d09e14c128 100644
--- a/test_conformance/math_brute_force/i_unary_double.cpp
+++ b/test_conformance/math_brute_force/i_unary_double.cpp
@@ -20,8 +20,10 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -100,16 +102,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -117,6 +119,8 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
                        info->programs + i, info->relaxedMode);
 }
 
+} // anonymous namespace
+
 int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
diff --git a/test_conformance/math_brute_force/i_unary_float.cpp b/test_conformance/math_brute_force/i_unary_float.cpp
index c803aa3252..89b566d99c 100644
--- a/test_conformance/math_brute_force/i_unary_float.cpp
+++ b/test_conformance/math_brute_force/i_unary_float.cpp
@@ -20,8 +20,10 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "__kernel void math_kernel",
                         sizeNames[vectorSize],
@@ -98,16 +100,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -115,6 +117,8 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
                        info->programs + i, info->relaxedMode);
 }
 
+} // anonymous namespace
+
 int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index d09915f6ee..1128126129 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -20,8 +20,10 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -107,7 +109,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
@@ -115,9 +117,9 @@ typedef struct BuildKernelInfo
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -126,16 +128,16 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
 {
     cl_mem inBuf; // input buffer for the thread
     cl_mem inBuf2; // input buffer for the thread
     cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
     MTdata d;
     cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+};
 
-typedef struct TestInfo
+struct TestInfo
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
@@ -150,11 +152,10 @@ typedef struct TestInfo
     cl_uint step; // step between each chunk and the next.
     cl_uint scale; // stride between individual test values
     int ftz; // non-zero if running in flush to zero mode
-
-} TestInfo;
+};
 
 // A table of more difficult cases to get right
-static const double specialValues[] = {
+const double specialValues[] = {
     -NAN,
     -INFINITY,
     -DBL_MAX,
@@ -264,174 +265,10 @@ static const double specialValues[] = {
     +0.0,
 };
 
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
 
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-
-int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ftz = f->ftz || gForceFTZ;
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (size_t i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -735,3 +572,167 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 exit:
     return error;
 }
+
+} // anonymous namespace
+
+int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ftz = f->ftz || gForceFTZ;
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (size_t i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    vlog("\n");
+
+exit:
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
index c530cdafde..6475e4bb06 100644
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -20,8 +20,10 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "__kernel void math_kernel",
                         sizeNames[vectorSize],
@@ -105,7 +107,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
@@ -113,9 +115,9 @@ typedef struct BuildKernelInfo
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -124,16 +126,16 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
 {
     cl_mem inBuf; // input buffer for the thread
     cl_mem inBuf2; // input buffer for the thread
     cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
     MTdata d;
     cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+};
 
-typedef struct TestInfo
+struct TestInfo
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
@@ -148,11 +150,10 @@ typedef struct TestInfo
     cl_uint step; // step between each chunk and the next.
     cl_uint scale; // stride between individual test values
     int ftz; // non-zero if running in flush to zero mode
-
-} TestInfo;
+};
 
 // A table of more difficult cases to get right
-static const float specialValues[] = {
+const float specialValues[] = {
     -NAN,
     -INFINITY,
     -FLT_MAX,
@@ -254,175 +255,10 @@ static const float specialValues[] = {
     +0.0f,
 };
 
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
 
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-
-int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -724,3 +560,168 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 exit:
     return error;
 }
+
+} // anonymous namespace
+
+int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    vlog("\n");
+
+exit:
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp
index 00e65a2cff..860e45960e 100644
--- a/test_conformance/math_brute_force/macro_unary_double.cpp
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -20,8 +20,10 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -101,7 +103,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
@@ -109,9 +111,9 @@ typedef struct BuildKernelInfo
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -120,14 +122,14 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
 {
     cl_mem inBuf; // input buffer for the thread
     cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
     cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+};
 
-typedef struct TestInfo
+struct TestInfo
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
@@ -142,160 +144,9 @@ typedef struct TestInfo
     cl_uint step; // step between each chunk and the next.
     cl_uint scale; // stride between individual test values
     int ftz; // non-zero if running in flush to zero mode
+};
 
-} TestInfo;
-
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-
-int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ftz = f->ftz || gForceFTZ;
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -506,3 +357,153 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     return CL_SUCCESS;
 }
+
+} // anonymous namespace
+
+int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ftz = f->ftz || gForceFTZ;
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    vlog("\n");
+
+exit:
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/macro_unary_float.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp
index 3c1717acb4..58a2a954ba 100644
--- a/test_conformance/math_brute_force/macro_unary_float.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -20,8 +20,10 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "__kernel void math_kernel",
                         sizeNames[vectorSize],
@@ -100,7 +102,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
@@ -108,9 +110,9 @@ typedef struct BuildKernelInfo
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -119,14 +121,14 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
 {
     cl_mem inBuf; // input buffer for the thread
     cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
     cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+};
 
-typedef struct TestInfo
+struct TestInfo
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
@@ -141,161 +143,9 @@ typedef struct TestInfo
     cl_uint step; // step between each chunk and the next.
     cl_uint scale; // stride between individual test values
     int ftz; // non-zero if running in flush to zero mode
+};
 
-} TestInfo;
-
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-
-int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -521,3 +371,154 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     return ret;
 }
+
+} // anonymous namespace
+
+int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    vlog("\n");
+
+exit:
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/mad_double.cpp b/test_conformance/math_brute_force/mad_double.cpp
index a32cd5a853..8e88f9f624 100644
--- a/test_conformance/math_brute_force/mad_double.cpp
+++ b/test_conformance/math_brute_force/mad_double.cpp
@@ -20,8 +20,10 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -113,16 +115,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -130,6 +132,8 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
                        info->programs + i, info->relaxedMode);
 }
 
+} // anonymous namespace
+
 int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
diff --git a/test_conformance/math_brute_force/mad_float.cpp b/test_conformance/math_brute_force/mad_float.cpp
index 095a22ff3b..0552ba4b96 100644
--- a/test_conformance/math_brute_force/mad_float.cpp
+++ b/test_conformance/math_brute_force/mad_float.cpp
@@ -20,8 +20,10 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "__kernel void math_kernel",
                         sizeNames[vectorSize],
@@ -111,16 +113,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -128,6 +130,8 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
                        info->programs + i, info->relaxedMode);
 }
 
+} // anonymous namespace
+
 int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp
index 606fdc5ad4..8af136ac27 100644
--- a/test_conformance/math_brute_force/ternary_double.cpp
+++ b/test_conformance/math_brute_force/ternary_double.cpp
@@ -23,8 +23,10 @@
 #define CORRECTLY_ROUNDED 0
 #define FLUSHED 1
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -116,16 +118,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -134,7 +136,7 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 // A table of more difficult cases to get right
-static const double specialValues[] = {
+const double specialValues[] = {
     -NAN,
     -INFINITY,
     -DBL_MAX,
@@ -202,9 +204,11 @@ static const double specialValues[] = {
     +0.0,
 };
 
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
 
+} // anonymous namespace
+
 int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
                                          bool relaxedMode)
 {
diff --git a/test_conformance/math_brute_force/ternary_float.cpp b/test_conformance/math_brute_force/ternary_float.cpp
index e52c0a0f41..c69083ada1 100644
--- a/test_conformance/math_brute_force/ternary_float.cpp
+++ b/test_conformance/math_brute_force/ternary_float.cpp
@@ -23,8 +23,10 @@
 #define CORRECTLY_ROUNDED 0
 #define FLUSHED 1
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "__kernel void math_kernel",
                         sizeNames[vectorSize],
@@ -114,16 +116,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -132,7 +134,7 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 // A table of more difficult cases to get right
-static const float specialValues[] = {
+const float specialValues[] = {
     -NAN,
     -INFINITY,
     -FLT_MAX,
@@ -210,9 +212,11 @@ static const float specialValues[] = {
     +0.0f,
 };
 
-static const size_t specialValuesCount =
+constexpr size_t specialValuesCount =
     sizeof(specialValues) / sizeof(specialValues[0]);
 
+} // anonymous namespace
+
 int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
index f6fa326447..dcd21884c4 100644
--- a/test_conformance/math_brute_force/unary_double.cpp
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -20,8 +20,10 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -101,7 +103,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
@@ -109,9 +111,9 @@ typedef struct BuildKernelInfo
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -120,16 +122,16 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
 {
     cl_mem inBuf; // input buffer for the thread
     cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
     float maxError; // max error value. Init to 0.
     double maxErrorValue; // position of the max error value.  Init to 0.
     cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+};
 
-typedef struct TestInfo
+struct TestInfo
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
@@ -151,174 +153,9 @@ typedef struct TestInfo
     float half_sin_cos_tan_limit;
     bool relaxedMode; // True if test is running in relaxed mode, false
                       // otherwise.
-} TestInfo;
-
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-
-int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->double_ulps;
-    test_info.ftz = f->ftz || gForceFTZ;
-    test_info.relaxedMode = relaxedMode;
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-
-        vlog("\t%8.2f @ %a", maxError, maxErrorVal);
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
+};
 
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -547,3 +384,168 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     return CL_SUCCESS;
 }
+
+} // anonymous namespace
+
+int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+    test_info.relaxedMode = relaxedMode;
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+            }
+        }
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    }
+
+    vlog("\n");
+
+exit:
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/unary_float.cpp b/test_conformance/math_brute_force/unary_float.cpp
index 17edc58d9d..f176fb95a8 100644
--- a/test_conformance/math_brute_force/unary_float.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -20,8 +20,10 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                cl_kernel *k, cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "__kernel void math_kernel",
                         sizeNames[vectorSize],
@@ -99,7 +101,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
@@ -107,9 +109,9 @@ typedef struct BuildKernelInfo
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -118,16 +120,16 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
 {
     cl_mem inBuf; // input buffer for the thread
     cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
     float maxError; // max error value. Init to 0.
     double maxErrorValue; // position of the max error value.  Init to 0.
     cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+};
 
-typedef struct TestInfo
+struct TestInfo
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
@@ -149,200 +151,9 @@ typedef struct TestInfo
     float half_sin_cos_tan_limit;
     bool relaxedMode; // True if test is running in relaxed mode, false
                       // otherwise.
-} TestInfo;
-
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
-
-int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    int skipTestingRelaxed = (relaxedMode && strcmp(f->name, "tan") == 0);
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    test_info.relaxedMode = relaxedMode;
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (cl_uint i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gOutBuffer[%d] for region {%zd, %zd}\n",
-                           (int)j, region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-    }
-
-    // Check for special cases for unary float
-    test_info.isRangeLimited = 0;
-    test_info.half_sin_cos_tan_limit = 0;
-    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
-    {
-        test_info.isRangeLimited = 1;
-        test_info.half_sin_cos_tan_limit = 1.0f
-            + test_info.ulps
-                * (FLT_EPSILON / 2.0f); // out of range results from finite
-                                        // inputs must be in [-1,1]
-    }
-    else if (0 == strcmp(f->name, "half_tan"))
-    {
-        test_info.isRangeLimited = 1;
-        test_info.half_sin_cos_tan_limit =
-            INFINITY; // out of range resut from finite inputs must be numeric
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernelFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting || skipTestingRelaxed)
-    {
-        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-
-        if (skipTestingRelaxed)
-        {
-            vlog(" (rlx skip correctness testing)\n");
-            goto exit;
-        }
-
-        vlog("\t%8.2f @ %a", maxError, maxErrorVal);
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
+};
 
-static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -725,3 +536,194 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     return CL_SUCCESS;
 }
+
+} // anonymous namespace
+
+int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    int skipTestingRelaxed = (relaxedMode && strcmp(f->name, "tan") == 0);
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.relaxedMode = relaxedMode;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+    }
+
+    // Check for special cases for unary float
+    test_info.isRangeLimited = 0;
+    test_info.half_sin_cos_tan_limit = 0;
+    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
+    {
+        test_info.isRangeLimited = 1;
+        test_info.half_sin_cos_tan_limit = 1.0f
+            + test_info.ulps
+                * (FLT_EPSILON / 2.0f); // out of range results from finite
+                                        // inputs must be in [-1,1]
+    }
+    else if (0 == strcmp(f->name, "half_tan"))
+    {
+        test_info.isRangeLimited = 1;
+        test_info.half_sin_cos_tan_limit =
+            INFINITY; // out of range resut from finite inputs must be numeric
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernelFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting || skipTestingRelaxed)
+    {
+        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+            }
+        }
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        if (skipTestingRelaxed)
+        {
+            vlog(" (rlx skip correctness testing)\n");
+            goto exit;
+        }
+
+        vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    }
+
+    vlog("\n");
+
+exit:
+    // Release
+    for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (cl_uint j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/unary_two_results_double.cpp b/test_conformance/math_brute_force/unary_two_results_double.cpp
index 71dd4f4431..8757fbc4ee 100644
--- a/test_conformance/math_brute_force/unary_two_results_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_double.cpp
@@ -20,8 +20,10 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -107,16 +109,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -124,6 +126,8 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
                        info->programs + i, info->relaxedMode);
 }
 
+} // anonymous namespace
+
 int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
diff --git a/test_conformance/math_brute_force/unary_two_results_float.cpp b/test_conformance/math_brute_force/unary_two_results_float.cpp
index 4a375ce3fd..a54bd024c2 100644
--- a/test_conformance/math_brute_force/unary_two_results_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_float.cpp
@@ -20,8 +20,10 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "__kernel void math_kernel",
                         sizeNames[vectorSize],
@@ -105,16 +107,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -122,6 +124,8 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
                        info->programs + i, info->relaxedMode);
 }
 
+} // anonymous namespace
+
 int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
diff --git a/test_conformance/math_brute_force/unary_two_results_i_double.cpp b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
index 14d1fb99f9..9ed77dce39 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
@@ -21,8 +21,10 @@
 #include <climits>
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -108,16 +110,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -125,12 +127,14 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_ulong abs_cl_long(cl_long i)
+cl_ulong abs_cl_long(cl_long i)
 {
     cl_long mask = i >> 63;
     return (i ^ mask) - mask;
 }
 
+} // anonymous namespace
+
 int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
diff --git a/test_conformance/math_brute_force/unary_two_results_i_float.cpp b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
index 23b0d7076b..d048220b1f 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
@@ -21,8 +21,10 @@
 #include <climits>
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "__kernel void math_kernel",
                         sizeNames[vectorSize],
@@ -106,16 +108,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -123,12 +125,14 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_ulong abs_cl_long(cl_long i)
+cl_ulong abs_cl_long(cl_long i)
 {
     cl_long mask = i >> 63;
     return (i ^ mask) - mask;
 }
 
+} // anonymous namespace
+
 int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
diff --git a/test_conformance/math_brute_force/unary_u_double.cpp b/test_conformance/math_brute_force/unary_u_double.cpp
index 3c5f99da5f..9478d0bc2d 100644
--- a/test_conformance/math_brute_force/unary_u_double.cpp
+++ b/test_conformance/math_brute_force/unary_u_double.cpp
@@ -20,8 +20,10 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                         "__kernel void math_kernel",
@@ -102,16 +104,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -119,11 +121,13 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_ulong random64(MTdata d)
+cl_ulong random64(MTdata d)
 {
     return (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32);
 }
 
+} // anonymous namespace
+
 int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
diff --git a/test_conformance/math_brute_force/unary_u_float.cpp b/test_conformance/math_brute_force/unary_u_float.cpp
index 44c5af473f..848a9bacdf 100644
--- a/test_conformance/math_brute_force/unary_u_float.cpp
+++ b/test_conformance/math_brute_force/unary_u_float.cpp
@@ -20,8 +20,10 @@
 
 #include <cstring>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
+namespace {
+
+int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
+                bool relaxedMode)
 {
     const char *c[] = { "__kernel void math_kernel",
                         sizeNames[vectorSize],
@@ -99,16 +101,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-typedef struct BuildKernelInfo
+struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_kernel *kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
+};
 
-static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
@@ -116,6 +118,8 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
                        info->programs + i, info->relaxedMode);
 }
 
+} // anonymous namespace
+
 int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;

From a08cacc67334788e8135964ca8edce373017ac55 Mon Sep 17 00:00:00 2001
From: ouakheli <53617630+ouakheli@users.noreply.github.com>
Date: Mon, 24 May 2021 11:31:37 +0100
Subject: [PATCH 099/158] Fix clang-format-9 install (#1261)

---
 .github/workflows/presubmit.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml
index 0c1778ebe6..8ef7e663e0 100644
--- a/.github/workflows/presubmit.yml
+++ b/.github/workflows/presubmit.yml
@@ -33,7 +33,7 @@ jobs:
     runs-on: ubuntu-20.04
     steps:
       - name: Install packages
-        run: sudo apt install -y clang-format
+        run: sudo apt install -y clang-format clang-format-9
       - uses: actions/checkout@v2
         with:
           fetch-depth: 0

From ed839ebf10c5b7334ac16b0fe13e324f3b47799a Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Mon, 24 May 2021 16:34:54 +0100
Subject: [PATCH 100/158] Avoid manual memory management (#1260)

* Avoid manual memory management

Prefer std::vector over malloc and free. This will allow removing goto
statements by leveraging RAII.

Use appropriate type (bool) to store overflow predicates and allocate
std::vector<bool> of appropriate sizes: before this change the
allocation was unnecessarily bigger than required.

No longer attempt to catch "out of host memory" issues, given that in
such a situation it is generally not possible to cleanly report an
error. Rely on the std::bad_alloc exception to report such issues.

Introduce a new header for common code in the math_brute_force
component. It is currently complementary to utility.h and is expected to
hold cleaned up content extracted from future refactoring operations.

List all headers as source in CMake for better compatibility with IDEs.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>

* Remove manual or unnecessary memset

In order to use non-POD types as fields of TestInfo, memset must be
replaced with a compatible zero-initialisation.

Remove an unnecessary memset in MakeKernels.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/CMakeLists.txt           |  6 ++
 .../math_brute_force/binary_double.cpp        | 76 +++++++-----------
 .../math_brute_force/binary_float.cpp         | 80 +++++++------------
 .../math_brute_force/binary_i_double.cpp      | 76 +++++++-----------
 .../math_brute_force/binary_i_float.cpp       | 76 +++++++-----------
 .../binary_operator_double.cpp                | 76 +++++++-----------
 .../binary_operator_float.cpp                 | 80 +++++++------------
 test_conformance/math_brute_force/common.h    | 27 +++++++
 .../math_brute_force/macro_binary_double.cpp  | 78 +++++++-----------
 .../math_brute_force/macro_binary_float.cpp   | 76 +++++++-----------
 .../math_brute_force/macro_unary_double.cpp   | 72 ++++++-----------
 .../math_brute_force/macro_unary_float.cpp    | 72 ++++++-----------
 test_conformance/math_brute_force/main.cpp    |  8 +-
 .../math_brute_force/unary_double.cpp         | 72 ++++++-----------
 .../math_brute_force/unary_float.cpp          | 72 ++++++-----------
 15 files changed, 366 insertions(+), 581 deletions(-)
 create mode 100644 test_conformance/math_brute_force/common.h

diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt
index d8dfc40322..28d2716f85 100644
--- a/test_conformance/math_brute_force/CMakeLists.txt
+++ b/test_conformance/math_brute_force/CMakeLists.txt
@@ -9,7 +9,9 @@ set(${MODULE_NAME}_SOURCES
     binary_operator_float.cpp
     binary_two_results_i_double.cpp
     binary_two_results_i_float.cpp
+    common.h
     function_list.cpp
+    function_list.h
     i_unary_double.cpp
     i_unary_float.cpp
     macro_binary_double.cpp
@@ -20,9 +22,12 @@ set(${MODULE_NAME}_SOURCES
     mad_float.cpp
     main.cpp
     reference_math.cpp
+    reference_math.h
     sleep.cpp
+    sleep.h
     ternary_double.cpp
     ternary_float.cpp
+    test_functions.h
     unary_double.cpp
     unary_float.cpp
     unary_two_results_double.cpp
@@ -32,6 +37,7 @@ set(${MODULE_NAME}_SOURCES
     unary_u_double.cpp
     unary_u_float.cpp
     utility.cpp
+    utility.h
 )
 
 include(../CMakeCommon.txt)
diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
index 9c6b59b4e5..a2b7d28bac 100644
--- a/test_conformance/math_brute_force/binary_double.cpp
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -115,7 +116,7 @@ struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
-    cl_kernel **kernels;
+    KernelMatrix &kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
@@ -126,7 +127,8 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+                       info->kernels[i].data(), info->programs + i,
+                       info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -149,11 +151,14 @@ struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -284,11 +289,11 @@ constexpr size_t specialValuesCount =
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     float ulps = job->ulps;
     dptr func = job->f->dfunc;
     int ftz = job->ftz;
@@ -647,7 +652,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
 int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo test_info;
+    TestInfo test_info{};
     cl_int error;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
@@ -656,7 +661,6 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
@@ -685,27 +689,10 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     // every thread
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
+        test_info.k[i].resize(test_info.threadCount, nullptr);
     }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
+
+    test_info.tinfo.resize(test_info.threadCount, ThreadInfo{});
     for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
@@ -802,27 +789,20 @@ int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
+        for (auto &kernel : test_info.k[i])
         {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
+            clReleaseKernel(kernel);
         }
     }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
 
-        free(test_info.tinfo);
+    for (auto &threadInfo : test_info.tinfo)
+    {
+        free_mtdata(threadInfo.d);
+        clReleaseMemObject(threadInfo.inBuf);
+        clReleaseMemObject(threadInfo.inBuf2);
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            clReleaseMemObject(threadInfo.outBuf[j]);
+        clReleaseCommandQueue(threadInfo.tQueue);
     }
 
     return error;
diff --git a/test_conformance/math_brute_force/binary_float.cpp b/test_conformance/math_brute_force/binary_float.cpp
index 9c7081dcff..97712ee8b9 100644
--- a/test_conformance/math_brute_force/binary_float.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -113,7 +114,7 @@ struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
-    cl_kernel **kernels;
+    KernelMatrix &kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
@@ -124,7 +125,8 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+                       info->kernels[i].data(), info->programs + i,
+                       info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -147,11 +149,14 @@ struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -274,18 +279,18 @@ constexpr size_t specialValuesCount =
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     fptr func = job->f->func;
     int ftz = job->ftz;
     bool relaxedMode = job->relaxedMode;
     float ulps = getAllowedUlpError(job->f, relaxedMode);
     MTdata d = tinfo->d;
     cl_int error;
-    cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
+    std::vector<bool> overflow(buffer_elements, false);
     const char *name = job->f->name;
     int isFDim = job->isFDim;
     int skipNanInf = job->skipNanInf;
@@ -447,7 +452,6 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
             vlog_error("Error: clFinish failed! err: %d\n", error);
             goto exit;
         }
-        free(overflow);
         return CL_SUCCESS;
     }
 
@@ -799,7 +803,6 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
 exit:
-    if (overflow) free(overflow);
     return error;
 }
 
@@ -807,7 +810,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
 int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo test_info;
+    TestInfo test_info{};
     cl_int error;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
@@ -816,7 +819,6 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
@@ -846,27 +848,10 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     // every thread
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
+        test_info.k[i].resize(test_info.threadCount, nullptr);
     }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
+
+    test_info.tinfo.resize(test_info.threadCount, ThreadInfo{});
     for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
@@ -963,27 +948,20 @@ int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
+        for (auto &kernel : test_info.k[i])
         {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
+            clReleaseKernel(kernel);
         }
     }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
 
-        free(test_info.tinfo);
+    for (auto &threadInfo : test_info.tinfo)
+    {
+        free_mtdata(threadInfo.d);
+        clReleaseMemObject(threadInfo.inBuf);
+        clReleaseMemObject(threadInfo.inBuf2);
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            clReleaseMemObject(threadInfo.outBuf[j]);
+        clReleaseCommandQueue(threadInfo.tQueue);
     }
 
     return error;
diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index 2fcc8c10b7..f15c21ede2 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -114,7 +115,7 @@ struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
-    cl_kernel **kernels;
+    KernelMatrix &kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
@@ -125,7 +126,8 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+                       info->kernels[i].data(), info->programs + i,
+                       info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -148,11 +150,14 @@ struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -287,11 +292,11 @@ constexpr size_t specialValuesIntCount =
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     float ulps = job->ulps;
     dptr func = job->f->dfunc;
     int ftz = job->ftz;
@@ -568,7 +573,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
 int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo test_info;
+    TestInfo test_info{};
     cl_int error;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
@@ -577,7 +582,6 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
@@ -602,27 +606,10 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
     // every thread
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
+        test_info.k[i].resize(test_info.threadCount, nullptr);
     }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
+
+    test_info.tinfo.resize(test_info.threadCount, ThreadInfo{});
     for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
@@ -722,27 +709,20 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
+        for (auto &kernel : test_info.k[i])
         {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
+            clReleaseKernel(kernel);
         }
     }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
 
-        free(test_info.tinfo);
+    for (auto &threadInfo : test_info.tinfo)
+    {
+        free_mtdata(threadInfo.d);
+        clReleaseMemObject(threadInfo.inBuf);
+        clReleaseMemObject(threadInfo.inBuf2);
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            clReleaseMemObject(threadInfo.outBuf[j]);
+        clReleaseCommandQueue(threadInfo.tQueue);
     }
 
     return error;
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
index e1538e3c42..9e27b00730 100644
--- a/test_conformance/math_brute_force/binary_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -112,7 +113,7 @@ struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
-    cl_kernel **kernels;
+    KernelMatrix &kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
@@ -123,7 +124,8 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+                       info->kernels[i].data(), info->programs + i,
+                       info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -146,11 +148,14 @@ struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -279,11 +284,11 @@ constexpr size_t specialValuesIntCount =
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     fptr func = job->f->func;
     int ftz = job->ftz;
     float ulps = job->ulps;
@@ -561,7 +566,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
 int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo test_info;
+    TestInfo test_info{};
     cl_int error;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
@@ -570,7 +575,6 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
@@ -596,27 +600,10 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
     // every thread
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
+        test_info.k[i].resize(test_info.threadCount, nullptr);
     }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
+
+    test_info.tinfo.resize(test_info.threadCount, ThreadInfo{});
     for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
@@ -716,27 +703,20 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
+        for (auto &kernel : test_info.k[i])
         {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
+            clReleaseKernel(kernel);
         }
     }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
 
-        free(test_info.tinfo);
+    for (auto &threadInfo : test_info.tinfo)
+    {
+        free_mtdata(threadInfo.d);
+        clReleaseMemObject(threadInfo.inBuf);
+        clReleaseMemObject(threadInfo.inBuf2);
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            clReleaseMemObject(threadInfo.outBuf[j]);
+        clReleaseCommandQueue(threadInfo.tQueue);
     }
 
     return error;
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index 605a31444e..c407fdaaf1 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -114,7 +115,7 @@ struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
-    cl_kernel **kernels;
+    KernelMatrix &kernels;
     cl_program *programs;
     const char *operator_symbol;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
@@ -125,7 +126,8 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->operator_symbol, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+                       info->kernels[i].data(), info->programs + i,
+                       info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -148,11 +150,14 @@ struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -281,11 +286,11 @@ constexpr size_t specialValuesCount =
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     float ulps = job->ulps;
     dptr func = job->f->dfunc;
     int ftz = job->ftz;
@@ -619,7 +624,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
                                            bool relaxedMode)
 {
-    TestInfo test_info;
+    TestInfo test_info{};
     cl_int error;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
@@ -628,7 +633,6 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
@@ -653,27 +657,10 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
     // every thread
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
+        test_info.k[i].resize(test_info.threadCount, nullptr);
     }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
+
+    test_info.tinfo.resize(test_info.threadCount, ThreadInfo{});
     for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
@@ -770,27 +757,20 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
+        for (auto &kernel : test_info.k[i])
         {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
+            clReleaseKernel(kernel);
         }
     }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
 
-        free(test_info.tinfo);
+    for (auto &threadInfo : test_info.tinfo)
+    {
+        free_mtdata(threadInfo.d);
+        clReleaseMemObject(threadInfo.inBuf);
+        clReleaseMemObject(threadInfo.inBuf2);
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            clReleaseMemObject(threadInfo.outBuf[j]);
+        clReleaseCommandQueue(threadInfo.tQueue);
     }
 
     return error;
diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
index 8448af5458..7fbb07c280 100644
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -112,7 +113,7 @@ struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
-    cl_kernel **kernels;
+    KernelMatrix &kernels;
     cl_program *programs;
     const char *operator_symbol;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
@@ -123,7 +124,8 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->operator_symbol, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+                       info->kernels[i].data(), info->programs + i,
+                       info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -146,11 +148,14 @@ struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -271,18 +276,18 @@ constexpr size_t specialValuesCount =
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     fptr func = job->f->func;
     int ftz = job->ftz;
     bool relaxedMode = job->relaxedMode;
     float ulps = getAllowedUlpError(job->f, relaxedMode);
     MTdata d = tinfo->d;
     cl_int error;
-    cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
+    std::vector<bool> overflow(buffer_elements, false);
     const char *name = job->f->name;
     cl_uint *t = 0;
     cl_float *r = 0;
@@ -445,7 +450,6 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
     if (gSkipCorrectnessTesting)
     {
-        free(overflow);
         return CL_SUCCESS;
     }
 
@@ -738,7 +742,6 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
 exit:
-    if (overflow) free(overflow);
     return error;
 }
 
@@ -747,7 +750,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
                                         bool relaxedMode)
 {
-    TestInfo test_info;
+    TestInfo test_info{};
     cl_int error;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
@@ -756,7 +759,6 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
@@ -783,27 +785,10 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
     // every thread
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
+        test_info.k[i].resize(test_info.threadCount, nullptr);
     }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
+
+    test_info.tinfo.resize(test_info.threadCount, ThreadInfo{});
     for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
@@ -900,27 +885,20 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
+        for (auto &kernel : test_info.k[i])
         {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
+            clReleaseKernel(kernel);
         }
     }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
 
-        free(test_info.tinfo);
+    for (auto &threadInfo : test_info.tinfo)
+    {
+        free_mtdata(threadInfo.d);
+        clReleaseMemObject(threadInfo.inBuf);
+        clReleaseMemObject(threadInfo.inBuf2);
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            clReleaseMemObject(threadInfo.outBuf[j]);
+        clReleaseCommandQueue(threadInfo.tQueue);
     }
 
     return error;
diff --git a/test_conformance/math_brute_force/common.h b/test_conformance/math_brute_force/common.h
new file mode 100644
index 0000000000..3eafb6de30
--- /dev/null
+++ b/test_conformance/math_brute_force/common.h
@@ -0,0 +1,27 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef COMMON_H
+#define COMMON_H
+
+#include "utility.h"
+
+#include <array>
+#include <vector>
+
+// Per-vector-size lists of per-thread kernels: [vector_size][thread_id].
+using KernelMatrix = std::array<std::vector<cl_kernel>, VECTOR_SIZE_COUNT>;
+
+#endif /* COMMON_H */
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index 1128126129..6db6aa5680 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -113,7 +114,7 @@ struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
-    cl_kernel **kernels;
+    KernelMatrix &kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
@@ -124,7 +125,8 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+                       info->kernels[i].data(), info->programs + i,
+                       info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -142,11 +144,14 @@ struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -270,11 +275,11 @@ constexpr size_t specialValuesCount =
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     dptr dfunc = job->f->dfunc;
     int ftz = job->ftz;
     MTdata d = tinfo->d;
@@ -577,13 +582,12 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
 int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo test_info;
+    TestInfo test_info{};
     cl_int error;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
@@ -607,28 +611,11 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     // every thread
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
+        test_info.k[i].resize(test_info.threadCount, nullptr);
     }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (size_t i = 0; i < test_info.threadCount; i++)
+
+    test_info.tinfo.resize(test_info.threadCount, ThreadInfo{});
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
             i * test_info.subBufferSize * sizeof(cl_double),
@@ -711,27 +698,20 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
+        for (auto &kernel : test_info.k[i])
         {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
+            clReleaseKernel(kernel);
         }
     }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
 
-        free(test_info.tinfo);
+    for (auto &threadInfo : test_info.tinfo)
+    {
+        free_mtdata(threadInfo.d);
+        clReleaseMemObject(threadInfo.inBuf);
+        clReleaseMemObject(threadInfo.inBuf2);
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            clReleaseMemObject(threadInfo.outBuf[j]);
+        clReleaseCommandQueue(threadInfo.tQueue);
     }
 
     return error;
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
index 6475e4bb06..d6d5c8eb98 100644
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -111,7 +112,7 @@ struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
-    cl_kernel **kernels;
+    KernelMatrix &kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
@@ -122,7 +123,8 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+                       info->kernels[i].data(), info->programs + i,
+                       info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -140,11 +142,14 @@ struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -260,11 +265,11 @@ constexpr size_t specialValuesCount =
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     fptr func = job->f->func;
     int ftz = job->ftz;
     MTdata d = tinfo->d;
@@ -565,13 +570,12 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
 int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo test_info;
+    TestInfo test_info{};
     cl_int error;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
@@ -596,27 +600,10 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     // every thread
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
+        test_info.k[i].resize(test_info.threadCount, nullptr);
     }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
+
+    test_info.tinfo.resize(test_info.threadCount, ThreadInfo{});
     for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
@@ -700,27 +687,20 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
+        for (auto &kernel : test_info.k[i])
         {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
+            clReleaseKernel(kernel);
         }
     }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
 
-        free(test_info.tinfo);
+    for (auto &threadInfo : test_info.tinfo)
+    {
+        free_mtdata(threadInfo.d);
+        clReleaseMemObject(threadInfo.inBuf);
+        clReleaseMemObject(threadInfo.inBuf2);
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            clReleaseMemObject(threadInfo.outBuf[j]);
+        clReleaseCommandQueue(threadInfo.tQueue);
     }
 
     return error;
diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp
index 860e45960e..1978c185db 100644
--- a/test_conformance/math_brute_force/macro_unary_double.cpp
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -107,7 +108,7 @@ struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
-    cl_kernel **kernels;
+    KernelMatrix &kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
@@ -118,7 +119,8 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+                       info->kernels[i].data(), info->programs + i,
+                       info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -134,11 +136,14 @@ struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -148,12 +153,12 @@ struct TestInfo
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint scale = job->scale;
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     dptr dfunc = job->f->dfunc;
     int ftz = job->ftz;
     cl_int error;
@@ -362,13 +367,12 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
 int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo test_info;
+    TestInfo test_info{};
     cl_int error;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
@@ -392,27 +396,10 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     // every thread
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
+        test_info.k[i].resize(test_info.threadCount, nullptr);
     }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
+
+    test_info.tinfo.resize(test_info.threadCount, ThreadInfo{});
     for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
@@ -484,25 +471,18 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
+        for (auto &kernel : test_info.k[i])
         {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
+            clReleaseKernel(kernel);
         }
     }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
 
-        free(test_info.tinfo);
+    for (auto &threadInfo : test_info.tinfo)
+    {
+        clReleaseMemObject(threadInfo.inBuf);
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            clReleaseMemObject(threadInfo.outBuf[j]);
+        clReleaseCommandQueue(threadInfo.tQueue);
     }
 
     return error;
diff --git a/test_conformance/math_brute_force/macro_unary_float.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp
index 58a2a954ba..ece5e9b6af 100644
--- a/test_conformance/math_brute_force/macro_unary_float.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -106,7 +107,7 @@ struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
-    cl_kernel **kernels;
+    KernelMatrix &kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
@@ -117,7 +118,8 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+                       info->kernels[i].data(), info->programs + i,
+                       info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -133,11 +135,14 @@ struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -147,12 +152,12 @@ struct TestInfo
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint scale = job->scale;
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     fptr func = job->f->func;
     int ftz = job->ftz;
     cl_int error = CL_SUCCESS;
@@ -376,13 +381,12 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
 int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo test_info;
+    TestInfo test_info{};
     cl_int error;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
@@ -407,27 +411,10 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     // every thread
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
+        test_info.k[i].resize(test_info.threadCount, nullptr);
     }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
+
+    test_info.tinfo.resize(test_info.threadCount, ThreadInfo{});
     for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
@@ -499,25 +486,18 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
+        for (auto &kernel : test_info.k[i])
         {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
+            clReleaseKernel(kernel);
         }
     }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
 
-        free(test_info.tinfo);
+    for (auto &threadInfo : test_info.tinfo)
+    {
+        clReleaseMemObject(threadInfo.inBuf);
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            clReleaseMemObject(threadInfo.outBuf[j]);
+        clReleaseCommandQueue(threadInfo.tQueue);
     }
 
     return error;
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index e52f2f0a18..6691f4626c 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -1055,8 +1055,6 @@ int MakeKernels(const char **c, cl_uint count, const char *name,
                 cl_uint kernel_count, cl_kernel *k, cl_program *p,
                 bool relaxedMode)
 {
-    int error = 0;
-    cl_uint i;
     char options[200] = "";
 
     if (gForceFTZ)
@@ -1074,7 +1072,7 @@ int MakeKernels(const char **c, cl_uint count, const char *name,
         strcat(options, " -cl-fast-relaxed-math");
     }
 
-    error =
+    int error =
         create_single_kernel_helper(gContext, p, NULL, count, c, NULL, options);
     if (error != CL_SUCCESS)
     {
@@ -1082,9 +1080,7 @@ int MakeKernels(const char **c, cl_uint count, const char *name,
         return error;
     }
 
-
-    memset(k, 0, kernel_count * sizeof(*k));
-    for (i = 0; i < kernel_count; i++)
+    for (cl_uint i = 0; i < kernel_count; i++)
     {
         k[i] = clCreateKernel(*p, name, &error);
         if (NULL == k[i] || error)
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
index dcd21884c4..2d45504772 100644
--- a/test_conformance/math_brute_force/unary_double.cpp
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -107,7 +108,7 @@ struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
-    cl_kernel **kernels;
+    KernelMatrix &kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
@@ -118,7 +119,8 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+                       info->kernels[i].data(), info->programs + i,
+                       info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -136,11 +138,14 @@ struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -157,12 +162,12 @@ struct TestInfo
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint scale = job->scale;
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     float ulps = job->ulps;
     dptr func = job->f->dfunc;
     cl_int error;
@@ -389,14 +394,13 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
 int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo test_info;
+    TestInfo test_info{};
     cl_int error;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
     // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
@@ -422,27 +426,10 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     // every thread
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
+        test_info.k[i].resize(test_info.threadCount, nullptr);
     }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
+
+    test_info.tinfo.resize(test_info.threadCount, ThreadInfo{});
     for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
@@ -526,25 +513,18 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
+        for (auto &kernel : test_info.k[i])
         {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
+            clReleaseKernel(kernel);
         }
     }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
 
-        free(test_info.tinfo);
+    for (auto &threadInfo : test_info.tinfo)
+    {
+        clReleaseMemObject(threadInfo.inBuf);
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            clReleaseMemObject(threadInfo.outBuf[j]);
+        clReleaseCommandQueue(threadInfo.tQueue);
     }
 
     return error;
diff --git a/test_conformance/math_brute_force/unary_float.cpp b/test_conformance/math_brute_force/unary_float.cpp
index f176fb95a8..83d27b0b9b 100644
--- a/test_conformance/math_brute_force/unary_float.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -105,7 +106,7 @@ struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
     cl_uint kernel_count;
-    cl_kernel **kernels;
+    KernelMatrix &kernels;
     cl_program *programs;
     const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
@@ -116,7 +117,8 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
+                       info->kernels[i].data(), info->programs + i,
+                       info->relaxedMode);
 }
 
 // Thread specific data for a worker thread
@@ -134,11 +136,14 @@ struct TestInfo
     size_t subBufferSize; // Size of the sub-buffer in elements
     const Func *f; // A pointer to the function info
     cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+
     cl_uint threadCount; // Number of worker threads
     cl_uint jobCount; // Number of jobs
     cl_uint step; // step between each chunk and the next.
@@ -155,12 +160,12 @@ struct TestInfo
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *)data;
+    TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
     size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint scale = job->scale;
     cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
     fptr func = job->f->func;
     const char *fname = job->f->name;
     bool relaxedMode = job->relaxedMode;
@@ -541,7 +546,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
 int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo test_info;
+    TestInfo test_info{};
     cl_int error;
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
@@ -550,7 +555,6 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
@@ -576,27 +580,10 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     // every thread
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
+        test_info.k[i].resize(test_info.threadCount, nullptr);
     }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
+
+    test_info.tinfo.resize(test_info.threadCount, ThreadInfo{});
     for (cl_uint i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = {
@@ -704,25 +691,18 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
+        for (auto &kernel : test_info.k[i])
         {
-            for (cl_uint j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
+            clReleaseKernel(kernel);
         }
     }
-    if (test_info.tinfo)
-    {
-        for (cl_uint i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
 
-        free(test_info.tinfo);
+    for (auto &threadInfo : test_info.tinfo)
+    {
+        clReleaseMemObject(threadInfo.inBuf);
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+            clReleaseMemObject(threadInfo.outBuf[j]);
+        clReleaseCommandQueue(threadInfo.tQueue);
     }
 
     return error;

From 0876ea10be4783340683c9970c5899ac8ed1d6ab Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Mon, 24 May 2021 16:59:03 +0100
Subject: [PATCH 101/158] Ignore padding bits in clCopyImage/clFillImage
 testing (#1184)

The CL_UNORM_SHORT_555 and CL_UNORM_INT_101010 formats contain padding
bits which need to be ignored in clCopyImage and clFillImage testing.

For clFillImage tests, padding was not ignored for the CL_UNORM_SHORT_555
format, and was ignored for CL_UNORM_INT_101010 by modifying actual and
reference data.  For clCopyImage tests, padding was not ignored, both for
CL_UNORM_SHORT_555 and for CL_UNORM_INT_101010.

Fix this by adding a new compare_scanlines() function, which is used for
both of these formats, and does not modify the actual or reference data.

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
---
 test_common/harness/imageHelpers.cpp          | 41 +++++++++++++++++++
 test_common/harness/imageHelpers.h            |  3 ++
 .../images/clCopyImage/test_copy_generic.cpp  | 23 ++++++-----
 .../images/clFillImage/test_fill_generic.cpp  | 28 +++++--------
 4 files changed, 66 insertions(+), 29 deletions(-)

diff --git a/test_common/harness/imageHelpers.cpp b/test_common/harness/imageHelpers.cpp
index d1754653e4..314709f82c 100644
--- a/test_common/harness/imageHelpers.cpp
+++ b/test_common/harness/imageHelpers.cpp
@@ -479,6 +479,47 @@ void print_first_pixel_difference_error(size_t where, const char *sourcePixel,
     }
 }
 
+size_t compare_scanlines(const image_descriptor *imageInfo, const char *aPtr,
+                         const char *bPtr)
+{
+    size_t pixel_size = get_pixel_size(imageInfo->format);
+    size_t column;
+
+    for (column = 0; column < imageInfo->width; column++)
+    {
+        switch (imageInfo->format->image_channel_data_type)
+        {
+            // For 101010 pixels, compare only the low 30 bits: the mask drops
+            // padding bits 30 and 31 (counting from 0)
+            case CL_UNORM_INT_101010: {
+                cl_uint aPixel = *(cl_uint *)aPtr;
+                cl_uint bPixel = *(cl_uint *)bPtr;
+                if ((aPixel & 0x3fffffff) != (bPixel & 0x3fffffff))
+                    return column;
+            }
+            break;
+
+            // For 555 pixels, the mask drops padding bit 15 (counting from 0)
+            case CL_UNORM_SHORT_555: {
+                cl_ushort aPixel = *(cl_ushort *)aPtr;
+                cl_ushort bPixel = *(cl_ushort *)bPtr;
+                if ((aPixel & 0x7fff) != (bPixel & 0x7fff)) return column;
+            }
+            break;
+
+            default:
+                if (memcmp(aPtr, bPtr, pixel_size) != 0) return column;
+                break;
+        }
+
+        aPtr += pixel_size;
+        bPtr += pixel_size;
+    }
+
+    // Every pixel matched (ignoring padding bits): return the image width
+    return column;
+}
+
 int random_log_in_range(int minV, int maxV, MTdata d)
 {
     double v = log2(((double)genrand_int32(d) / (double)0xffffffff) + 1);
diff --git a/test_common/harness/imageHelpers.h b/test_common/harness/imageHelpers.h
index 848ec65574..e728a939c2 100644
--- a/test_common/harness/imageHelpers.h
+++ b/test_common/harness/imageHelpers.h
@@ -139,6 +139,9 @@ void print_first_pixel_difference_error(size_t where, const char *sourcePixel,
                                         image_descriptor *imageInfo, size_t y,
                                         size_t thirdDim);
 
+size_t compare_scanlines(const image_descriptor *imageInfo, const char *aPtr,
+                         const char *bPtr);
+
 void get_max_sizes(size_t *numberOfSizes, const int maxNumberOfSizes,
                    size_t sizes[][3], size_t maxWidth, size_t maxHeight,
                    size_t maxDepth, size_t maxArraySize,
diff --git a/test_conformance/images/clCopyImage/test_copy_generic.cpp b/test_conformance/images/clCopyImage/test_copy_generic.cpp
index 026916e8cb..bd935e7f34 100644
--- a/test_conformance/images/clCopyImage/test_copy_generic.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_generic.cpp
@@ -547,18 +547,19 @@ int test_copy_image_generic( cl_context context, cl_command_queue queue, image_d
         {
             if( memcmp( sourcePtr, destPtr, scanlineSize ) != 0 )
             {
-                // Find the first missing pixel
+                // Find the first differing pixel
                 size_t pixel_size = get_pixel_size( dstImageInfo->format );
-                size_t where = 0;
-                for( where = 0; where < dstImageInfo->width; where++ )
-                    if( memcmp( sourcePtr + pixel_size * where, destPtr + pixel_size * where, pixel_size) )
-                        break;
-
-                print_first_pixel_difference_error(
-                    where, sourcePtr + pixel_size * where,
-                    destPtr + pixel_size * where, dstImageInfo, y,
-                    dstImageInfo->depth);
-                return -1;
+                size_t where =
+                    compare_scanlines(dstImageInfo, sourcePtr, destPtr);
+
+                if (where < dstImageInfo->width)
+                {
+                    print_first_pixel_difference_error(
+                        where, sourcePtr + pixel_size * where,
+                        destPtr + pixel_size * where, dstImageInfo, y,
+                        dstImageInfo->depth);
+                    return -1;
+                }
             }
             sourcePtr += rowPitch;
             if((dstImageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY || dstImageInfo->type == CL_MEM_OBJECT_IMAGE1D))
diff --git a/test_conformance/images/clFillImage/test_fill_generic.cpp b/test_conformance/images/clFillImage/test_fill_generic.cpp
index 59bf24ad21..6cd6beb0ea 100644
--- a/test_conformance/images/clFillImage/test_fill_generic.cpp
+++ b/test_conformance/images/clFillImage/test_fill_generic.cpp
@@ -468,27 +468,19 @@ int test_fill_image_generic( cl_context context, cl_command_queue queue, image_d
     {
         for ( size_t y = 0; y < secondDim; y++ )
         {
-            // If the data type is 101010 ignore bits 31 and 32 when comparing the row
-            if (imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010) {
-              for (size_t w=0;w!=scanlineSize/4;++w) {
-                ((cl_uint*)sourcePtr)[w] &= 0x3FFFFFFF;
-                ((cl_uint*)destPtr)[w] &= 0x3FFFFFFF;
-              }
-            }
-
             if (memcmp( sourcePtr, destPtr, scanlineSize ) != 0)
             {
-                // Find the first missing pixel
+                // Find the first differing pixel
                 size_t pixel_size = get_pixel_size( imageInfo->format );
-                size_t where = 0;
-                for ( where = 0; where < imageInfo->width; where++ )
-                    if ( memcmp( sourcePtr + pixel_size * where, destPtr + pixel_size * where, pixel_size) )
-                        break;
-
-                print_first_pixel_difference_error(
-                    where, sourcePtr + pixel_size * where,
-                    destPtr + pixel_size * where, imageInfo, y, thirdDim);
-                return -1;
+                size_t where = compare_scanlines(imageInfo, sourcePtr, destPtr);
+
+                if (where < imageInfo->width)
+                {
+                    print_first_pixel_difference_error(
+                        where, sourcePtr + pixel_size * where,
+                        destPtr + pixel_size * where, imageInfo, y, thirdDim);
+                    return -1;
+                }
             }
 
             total_matched += scanlineSize;

From bd3135dd016aae7ae6454725ef3761d132a38926 Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Thu, 27 May 2021 10:05:27 +0200
Subject: [PATCH 102/158] Extend list of known extensions (#1262)

---
 .../compiler/test_compiler_defines_for_extensions.cpp    | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
index 483adac9a2..a1d8d8bdb5 100644
--- a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
+++ b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
@@ -49,7 +49,7 @@ const char *known_extensions[] = {
     "cl_khr_subgroup_shuffle",
     "cl_khr_subgroup_shuffle_relative",
     "cl_khr_subgroup_clustered_reduce",
-
+    "cl_khr_extended_bit_ops",
     // API-only extensions after this point.  If you add above here, modify
     // first_API_extension below.
     "cl_khr_icd",
@@ -71,10 +71,13 @@ const char *known_extensions[] = {
     "cl_khr_spirv_no_integer_wrap_decoration",
     "cl_khr_extended_versioning",
     "cl_khr_device_uuid",
+    "cl_khr_pci_bus_info",
+    "cl_khr_suggested_local_work_size",
+    "cl_khr_spirv_linkonce_odr",
 };
 
-size_t num_known_extensions = sizeof(known_extensions)/sizeof(char*);
-size_t first_API_extension = 27;
+size_t num_known_extensions = sizeof(known_extensions) / sizeof(char *);
+size_t first_API_extension = 28;
 
 const char *known_embedded_extensions[] = {
     "cles_khr_int64",

From 315998511abe3959be21962a696911b43d4d5f59 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Thu, 27 May 2021 09:06:13 +0100
Subject: [PATCH 103/158] Address data race in ThreadPool (#1265)

ThreadSanitizer detects some data races in ThreadPool. They stem from
inappropriate usage of volatile variables, which are replaced with
std::atomic variables in this patch.

This patch focuses on data races identified while running the
math_brute_force component. For example, it doesn't fully remove usage
of ThreadPool_AtomicAdd from other components of the CTS. Furthermore,
thread leaks, most likely because threads are not joined, are not
addressed.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_common/harness/ThreadPool.cpp | 44 +++++++++++++-----------------
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/test_common/harness/ThreadPool.cpp b/test_common/harness/ThreadPool.cpp
index 5dae1b4aa3..627980458c 100644
--- a/test_common/harness/ThreadPool.cpp
+++ b/test_common/harness/ThreadPool.cpp
@@ -22,6 +22,8 @@
 #if defined(__APPLE__) || defined(__linux__) || defined(_WIN32)
 // or any other POSIX system
 
+#include <atomic>
+
 #if defined(_WIN32)
 #include <windows.h>
 #if defined(_MSC_VER)
@@ -241,7 +243,7 @@ pthread_cond_t cond_var;
 // Condition variable state. How many iterations on the function left to run,
 // set to CL_INT_MAX to cause worker threads to exit. Note: this value might
 // go negative.
-volatile cl_int gRunCount = 0;
+std::atomic<cl_int> gRunCount{ 0 };
 
 // State that only changes when the threadpool is not working.
 volatile TPFuncPtr gFunc_ptr = NULL;
@@ -261,19 +263,20 @@ pthread_cond_t caller_cond_var;
 
 // # of threads intended to be running. Running threads will decrement this
 // as they discover they've run out of work to do.
-volatile cl_int gRunning = 0;
+std::atomic<cl_int> gRunning{ 0 };
 
 // The total number of threads launched.
-volatile cl_int gThreadCount = 0;
+std::atomic<cl_int> gThreadCount{ 0 };
+
 #ifdef _WIN32
 void ThreadPool_WorkerFunc(void *p)
 #else
 void *ThreadPool_WorkerFunc(void *p)
 #endif
 {
-    cl_uint threadID = ThreadPool_AtomicAdd((volatile cl_int *)p, 1);
-    cl_int item = ThreadPool_AtomicAdd(&gRunCount, -1);
-    // log_info( "ThreadPool_WorkerFunc start: gRunning = %d\n", gRunning );
+    auto &tid = *static_cast<std::atomic<cl_uint> *>(p);
+    cl_uint threadID = tid++;
+    cl_int item = gRunCount--;
 
     while (MAX_COUNT > item)
     {
@@ -282,8 +285,6 @@ void *ThreadPool_WorkerFunc(void *p)
         // check for more work to do
         if (0 >= item)
         {
-            // log_info("Thread %d has run out of work.\n", threadID);
-
             // No work to do. Attempt to block waiting for work
 #if defined(_WIN32)
             EnterCriticalSection(cond_lock);
@@ -298,9 +299,7 @@ void *ThreadPool_WorkerFunc(void *p)
             }
 #endif // !_WIN32
 
-            cl_int remaining = ThreadPool_AtomicAdd(&gRunning, -1);
-            // log_info("ThreadPool_WorkerFunc: gRunning = %d\n",
-            //          remaining - 1);
+            cl_int remaining = gRunning--;
             if (1 == remaining)
             { // last thread out signal the main thread to wake up
 #if defined(_WIN32)
@@ -350,7 +349,7 @@ void *ThreadPool_WorkerFunc(void *p)
 #endif // !_WIN32
 
                 // try again to get a valid item id
-                item = ThreadPool_AtomicAdd(&gRunCount, -1);
+                item = gRunCount--;
                 if (MAX_COUNT <= item) // exit if we are done
                 {
 #if defined(_WIN32)
@@ -362,8 +361,7 @@ void *ThreadPool_WorkerFunc(void *p)
                 }
             }
 
-            ThreadPool_AtomicAdd(&gRunning, 1);
-            // log_info("Thread %d has found work.\n", threadID);
+            gRunning++;
 
 #if defined(_WIN32)
             LeaveCriticalSection(cond_lock);
@@ -447,12 +445,12 @@ void *ThreadPool_WorkerFunc(void *p)
         }
 
         // get the next item
-        item = ThreadPool_AtomicAdd(&gRunCount, -1);
+        item = gRunCount--;
     }
 
 exit:
     log_info("ThreadPool: thread %d exiting.\n", threadID);
-    ThreadPool_AtomicAdd(&gThreadCount, -1);
+    gThreadCount--;
 #if !defined(_WIN32)
     return NULL;
 #endif
@@ -487,7 +485,7 @@ void ThreadPool_Init(void)
 {
     cl_int i;
     int err;
-    volatile cl_uint threadID = 0;
+    std::atomic<cl_uint> threadID{ 0 };
 
     // Check for manual override of multithreading code. We add this for better
     // debuggability.
@@ -624,7 +622,7 @@ void ThreadPool_Init(void)
     }
 #endif // !_WIN32
 
-    gRunning = gThreadCount;
+    gRunning = gThreadCount.load();
     // init threads
     for (i = 0; i < gThreadCount; i++)
     {
@@ -688,10 +686,6 @@ static BOOL CALLBACK _ThreadPool_Init(_PINIT_ONCE InitOnce, PVOID Parameter,
 
 void ThreadPool_Exit(void)
 {
-#ifndef _WIN32
-    int err;
-#endif
-    int count;
     gRunCount = CL_INT_MAX;
 
 #if defined(__GNUC__)
@@ -705,13 +699,13 @@ void ThreadPool_Exit(void)
 #endif
 
     // spin waiting for threads to die
-    for (count = 0; 0 != gThreadCount && count < 1000; count++)
+    for (int count = 0; 0 != gThreadCount && count < 1000; count++)
     {
 #if defined(_WIN32)
         _WakeAllConditionVariable(cond_var);
         Sleep(1);
 #else // !_WIN32
-        if ((err = pthread_cond_broadcast(&cond_var)))
+        if (int err = pthread_cond_broadcast(&cond_var))
         {
             log_error("Error %d from pthread_cond_broadcast. Unable to wake up "
                       "work threads. ThreadPool_Exit failed.\n",
@@ -725,7 +719,7 @@ void ThreadPool_Exit(void)
     if (gThreadCount)
         log_error("Error: Thread pool timed out after 1 second with %d threads "
                   "still active.\n",
-                  gThreadCount);
+                  gThreadCount.load());
     else
         log_info("Thread pool exited in a orderly fashion.\n");
 }

From 76ace61314e061fbf0f8a058dab19fa7e04df937 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Wed, 9 Jun 2021 11:08:08 +0100
Subject: [PATCH 104/158] Fix leaks in callSingleTestFunction (#1224)

The context and queue were not released when the test was not supported
in offline mode, and the context was not released when the queue could
not be created.

Inline test_missing_support_offline_cmpiler_ret macro, remove dead
parameter of check_functions_for_offline_compiler and slightly refactor
callSingleTestFunction to address leaks.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_common/harness/errorHelpers.cpp | 15 +++++++--------
 test_common/harness/errorHelpers.h   | 18 +-----------------
 test_common/harness/testHarness.cpp  | 12 +++++++++---
 3 files changed, 17 insertions(+), 28 deletions(-)

diff --git a/test_common/harness/errorHelpers.cpp b/test_common/harness/errorHelpers.cpp
index 3ddbc37bcf..ea928bc395 100644
--- a/test_common/harness/errorHelpers.cpp
+++ b/test_common/harness/errorHelpers.cpp
@@ -21,6 +21,7 @@
 #include "errorHelpers.h"
 
 #include "parseParameters.h"
+#include "testHarness.h"
 
 #include <CL/cl_half.h>
 
@@ -690,21 +691,19 @@ const char *subtests_to_skip_with_offline_compiler[] = {
     "library_function"
 };
 
-int check_functions_for_offline_compiler(const char *subtestname,
-                                         cl_device_id device)
+bool check_functions_for_offline_compiler(const char *subtestname)
 {
     if (gCompilationMode != kOnline)
     {
         size_t nNotRequiredWithOfflineCompiler =
-            sizeof(subtests_to_skip_with_offline_compiler) / sizeof(char *);
-        size_t i;
-        for (i = 0; i < nNotRequiredWithOfflineCompiler; ++i)
+            ARRAY_SIZE(subtests_to_skip_with_offline_compiler);
+        for (size_t i = 0; i < nNotRequiredWithOfflineCompiler; ++i)
         {
             if (!strcmp(subtestname, subtests_to_skip_with_offline_compiler[i]))
             {
-                return 1;
+                return false;
             }
         }
     }
-    return 0;
-}
\ No newline at end of file
+    return true;
+}
diff --git a/test_common/harness/errorHelpers.h b/test_common/harness/errorHelpers.h
index c7f49e3dd7..d59bc78de0 100644
--- a/test_common/harness/errorHelpers.h
+++ b/test_common/harness/errorHelpers.h
@@ -92,21 +92,6 @@ static int vlog_win32(const char *format, ...);
                         "the device version! (from %s:%d)\n",                  \
                         msg, __FILE__, __LINE__);
 
-#define test_missing_support_offline_cmpiler(errCode, msg)                     \
-    test_missing_support_offline_cmpiler_ret(errCode, msg, errCode)
-// this macro should always return CL_SUCCESS, but print the skip message on
-// test not supported with offline compiler
-#define test_missing_support_offline_cmpiler_ret(errCode, msg, retValue)       \
-    {                                                                          \
-        if (errCode != CL_SUCCESS)                                             \
-        {                                                                      \
-            log_info("INFO: Subtest %s tests is not supported in offline "     \
-                     "compiler execution path! (from %s:%d)\n",                \
-                     msg, __FILE__, __LINE__);                                 \
-            return TEST_SKIP;                                                  \
-        }                                                                      \
-    }
-
 // expected error code vs. what we got
 #define test_failure_error(errCode, expectedErrCode, msg)                      \
     test_failure_error_ret(errCode, expectedErrCode, msg,                      \
@@ -181,8 +166,7 @@ extern const char *GetAddressModeName(cl_addressing_mode mode);
 extern const char *GetQueuePropertyName(cl_command_queue_properties properties);
 
 extern const char *GetDeviceTypeName(cl_device_type type);
-int check_functions_for_offline_compiler(const char *subtestname,
-                                         cl_device_id device);
+bool check_functions_for_offline_compiler(const char *subtestname);
 cl_int OutputBuildLogs(cl_program program, cl_uint num_devices,
                        cl_device_id *device_list);
 
diff --git a/test_common/harness/testHarness.cpp b/test_common/harness/testHarness.cpp
index 1aec3d0789..b3863918da 100644
--- a/test_common/harness/testHarness.cpp
+++ b/test_common/harness/testHarness.cpp
@@ -783,6 +783,14 @@ test_status callSingleTestFunction(test_definition test,
         return TEST_SKIP;
     }
 
+    if (!check_functions_for_offline_compiler(test.name))
+    {
+        log_info("Subtest %s tests is not supported in offline compiler "
+                 "execution path!\n",
+                 test.name);
+        return TEST_SKIP;
+    }
+
     /* Create a context to work with, unless we're told not to */
     if (!forceNoContextCreation)
     {
@@ -812,14 +820,12 @@ test_status callSingleTestFunction(test_definition test,
         if (queue == NULL)
         {
             print_error(error, "Unable to create testing command queue");
+            clReleaseContext(context);
             return TEST_FAIL;
         }
     }
 
     /* Run the test and print the result */
-    error = check_functions_for_offline_compiler(test.name, deviceToUse);
-    test_missing_support_offline_cmpiler(error, test.name);
-
     if (test.func == NULL)
     {
         // Skip unimplemented test, can happen when all of the tests are

From 277d029608ed0f7fdb0823f010d653dd0169c82c Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Fri, 11 Jun 2021 09:42:20 +0100
Subject: [PATCH 105/158] Run spirv-val for SPIR-V offline compilation (#1108)

The common --disable-spirv-validation option has been added to disable
this functionality.

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
---
 test_common/harness/kernelHelpers.cpp   | 22 ++++++++++++++++-
 test_common/harness/parseParameters.cpp | 32 ++++++++++++++++++++++++-
 test_common/harness/parseParameters.h   |  2 ++
 3 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/test_common/harness/kernelHelpers.cpp b/test_common/harness/kernelHelpers.cpp
index aaf0d689ca..18f51cbe26 100644
--- a/test_common/harness/kernelHelpers.cpp
+++ b/test_common/harness/kernelHelpers.cpp
@@ -530,7 +530,7 @@ static int get_offline_compiler_output(
                                                 sourceFilename, outputFilename);
             if (error != CL_SUCCESS) return error;
 
-            // read output file
+            // open output file for reading
             ifs.open(outputFilename.c_str(), std::ios::binary);
             if (!ifs.good())
             {
@@ -540,6 +540,26 @@ static int get_offline_compiler_output(
             }
         }
     }
+
+    if (compilationMode == kSpir_v && !gDisableSPIRVValidation)
+    {
+        std::string runString = gSPIRVValidator + " " + outputFilename;
+
+        int returnCode = system(runString.c_str());
+        if (returnCode == -1)
+        {
+            log_error("Error: failed to invoke SPIR-V validator\n");
+            return CL_COMPILE_PROGRAM_FAILURE;
+        }
+        else if (returnCode != 0)
+        {
+            log_error(
+                "Failed to validate SPIR-V file %s: system() returned 0x%x\n",
+                outputFilename.c_str(), returnCode);
+            return CL_COMPILE_PROGRAM_FAILURE;
+        }
+    }
+
     return CL_SUCCESS;
 }
 
diff --git a/test_common/harness/parseParameters.cpp b/test_common/harness/parseParameters.cpp
index b2ab5b0223..e946d744a4 100644
--- a/test_common/harness/parseParameters.cpp
+++ b/test_common/harness/parseParameters.cpp
@@ -28,11 +28,14 @@
 using namespace std;
 
 #define DEFAULT_COMPILATION_PROGRAM "cl_offline_compiler"
+#define DEFAULT_SPIRV_VALIDATOR "spirv-val"
 
 CompilationMode gCompilationMode = kOnline;
 CompilationCacheMode gCompilationCacheMode = kCacheModeCompileIfAbsent;
 std::string gCompilationCachePath = ".";
 std::string gCompilationProgram = DEFAULT_COMPILATION_PROGRAM;
+bool gDisableSPIRVValidation = false;
+std::string gSPIRVValidator = DEFAULT_SPIRV_VALIDATOR;
 
 void helpInfo()
 {
@@ -62,7 +65,14 @@ For offline compilation (binary and spir-v modes) only:
         Path for offline compiler output and CL source
     --compilation-program <prog>
         Program to use for offline compilation, defaults to:
-            )" DEFAULT_COMPILATION_PROGRAM "\n\n");
+            )" DEFAULT_COMPILATION_PROGRAM R"(
+
+For spir-v mode only:
+    --disable-spirv-validation
+        Disable validation of SPIR-V using the SPIR-V validator
+    --spirv-validator
+        Path for SPIR-V validator, defaults to )" DEFAULT_SPIRV_VALIDATOR "\n"
+        "\n");
 }
 
 int parseCustomParam(int argc, const char *argv[], const char *ignore)
@@ -198,6 +208,26 @@ int parseCustomParam(int argc, const char *argv[], const char *ignore)
                 return -1;
             }
         }
+        else if (!strcmp(argv[i], "--disable-spirv-validation"))
+        {
+            delArg++;
+            gDisableSPIRVValidation = true;
+        }
+        else if (!strcmp(argv[i], "--spirv-validator"))
+        {
+            delArg++;
+            if ((i + 1) < argc)
+            {
+                delArg++;
+                gSPIRVValidator = argv[i + 1];
+            }
+            else
+            {
+                log_error("Program argument for --spirv-validator was not "
+                          "specified.\n");
+                return -1;
+            }
+        }
 
         // cleaning parameters from argv tab
         for (int j = i; j < argc - delArg; j++) argv[j] = argv[j + delArg];
diff --git a/test_common/harness/parseParameters.h b/test_common/harness/parseParameters.h
index b0f8328a14..437e12f94d 100644
--- a/test_common/harness/parseParameters.h
+++ b/test_common/harness/parseParameters.h
@@ -38,6 +38,8 @@ extern CompilationMode gCompilationMode;
 extern CompilationCacheMode gCompilationCacheMode;
 extern std::string gCompilationCachePath;
 extern std::string gCompilationProgram;
+extern bool gDisableSPIRVValidation;
+extern std::string gSPIRVValidator;
 
 extern int parseCustomParam(int argc, const char *argv[],
                             const char *ignore = 0);

From 80a4a833be9bc390574801dc5a47b02a579bf47b Mon Sep 17 00:00:00 2001
From: John Kesapides <46718829+JohnKesapidesARM@users.noreply.github.com>
Date: Fri, 11 Jun 2021 09:44:16 +0100
Subject: [PATCH 106/158] Minor fixes for CL_UNORM_SHORT_565,
 CL_UNORM_SHORT_555 (#1129)

* Minor fixes for CL_UNORM_SHORT_565, CL_UNORM_SHORT_555

* Fix verification for undefined bit
* Relax the current infinitely precise requirement for these formats
  and move the check into a common function.
* Add proper debug output.

Signed-off-by: John Kesapides <john.kesapides@arm.com>

* Minor formatting fix.

Signed-off-by: John Kesapides <john.kesapides@arm.com>
---
 .../images/kernel_read_write/test_common.cpp  | 37 ++++++++-
 .../images/kernel_read_write/test_common.h    |  5 ++
 .../kernel_read_write/test_write_1D.cpp       | 70 ++++++++++++----
 .../kernel_read_write/test_write_1D_array.cpp | 71 ++++++++++++----
 .../kernel_read_write/test_write_2D_array.cpp | 80 +++++++++++++++----
 .../kernel_read_write/test_write_3D.cpp       | 80 +++++++++++++++----
 .../kernel_read_write/test_write_image.cpp    | 72 +++++++++++++----
 7 files changed, 339 insertions(+), 76 deletions(-)

diff --git a/test_conformance/images/kernel_read_write/test_common.cpp b/test_conformance/images/kernel_read_write/test_common.cpp
index e76710b577..375ee5877e 100644
--- a/test_conformance/images/kernel_read_write/test_common.cpp
+++ b/test_conformance/images/kernel_read_write/test_common.cpp
@@ -1543,4 +1543,39 @@ int test_read_image(cl_context context, cl_command_queue queue,
     }
 
     return numTries != MAX_TRIES || numClamped != MAX_CLAMPED;
-}
\ No newline at end of file
+}
+
+void filter_undefined_bits(image_descriptor *imageInfo, char *resultPtr)
+{
+    // Bit 15 of a CL_UNORM_SHORT_555 pixel is undefined in OpenCL (it may read
+    // back as 0 or 1), so clear it in the result; pixels of every other
+    // channel data type are left untouched.
+    if (imageInfo->format->image_channel_data_type != CL_UNORM_SHORT_555)
+        return;
+
+    cl_ushort *pixel = (cl_ushort *)resultPtr;
+    *pixel &= 0x7fff;
+}
+
+int filter_rounding_errors(int forceCorrectlyRoundedWrites,
+                           image_descriptor *imageInfo, float *errors)
+{
+    // Some normalized formats may deviate from the infinitely precise result
+    // by up to 0.6 (absolute) unless correctly rounded writes are forced.
+    if (forceCorrectlyRoundedWrites != 0) return 1;
+
+    switch (imageInfo->format->image_channel_data_type)
+    {
+        case CL_UNORM_INT8:
+        case CL_UNORM_INT_101010:
+        case CL_UNORM_INT16:
+        case CL_SNORM_INT8:
+        case CL_SNORM_INT16:
+        case CL_UNORM_SHORT_555:
+        case CL_UNORM_SHORT_565: break;
+        default: return 1;
+    }
+    for (int channel = 0; channel < 4; channel++)
+        if (fabsf(errors[channel]) > 0.6f) return 1; // NaN never exceeds
+    return 0;
+}
diff --git a/test_conformance/images/kernel_read_write/test_common.h b/test_conformance/images/kernel_read_write/test_common.h
index e7ecbe0b55..656c41f47d 100644
--- a/test_conformance/images/kernel_read_write/test_common.h
+++ b/test_conformance/images/kernel_read_write/test_common.h
@@ -229,3 +229,8 @@ int determine_validation_error_offset(
     }
     return 0;
 }
+
+
+extern int filter_rounding_errors(int forceCorrectlyRoundedWrites,
+                                  image_descriptor *imageInfo, float *errors);
+extern void filter_undefined_bits(image_descriptor *imageInfo, char *resultPtr);
diff --git a/test_conformance/images/kernel_read_write/test_write_1D.cpp b/test_conformance/images/kernel_read_write/test_write_1D.cpp
index 41983edf75..1556a76a09 100644
--- a/test_conformance/images/kernel_read_write/test_write_1D.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_1D.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 #include "../testBase.h"
+#include "test_common.h"
 
 #if !defined(_WIN32)
 #include <sys/mman.h>
@@ -395,6 +396,8 @@ int test_write_image_1D( cl_device_id device, cl_context context, cl_command_que
                     }
                     else
                     {
+                        filter_undefined_bits(imageInfo, resultPtr);
+
                         // Exact result passes every time
                         if( memcmp( resultBuffer, resultPtr, get_pixel_size( imageInfo->format ) ) != 0 )
                         {
@@ -403,21 +406,8 @@ int test_write_image_1D( cl_device_id device, cl_context context, cl_command_que
                             float errors[4] = {NAN, NAN, NAN, NAN};
                             pack_image_pixel_error( (float *)imagePtr, imageInfo->format, resultBuffer, errors );
 
-                            // We are allowed 0.6 absolute error vs. infinitely precise for some normalized formats
-                            if( 0 == forceCorrectlyRoundedWrites    &&
-                               (
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT8 ||
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 ||
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT16 ||
-                                imageInfo->format->image_channel_data_type == CL_SNORM_INT8 ||
-                                imageInfo->format->image_channel_data_type == CL_SNORM_INT16
-                                ))
-                            {
-                                if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) &&
-                                   ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f)  )
-                                    failure = 0;
-                            }
-
+                            failure = filter_rounding_errors(
+                                forceCorrectlyRoundedWrites, imageInfo, errors);
 
                             if( failure )
                             {
@@ -458,6 +448,56 @@ int test_write_image_1D( cl_device_id device, cl_context context, cl_command_que
                                         log_error( "    Actual:   0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", ((cl_uchar*)resultPtr)[0], ((cl_uchar*)resultPtr)[1], ((cl_uchar*)resultPtr)[2], ((cl_uchar*)resultPtr)[3] );
                                         log_error( "    Error:    %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] );
                                         break;
+                                    case CL_UNORM_SHORT_565: {
+                                        cl_ushort *ref_value =
+                                            (cl_ushort *)resultBuffer;
+                                        cl_ushort *test_value =
+                                            (cl_ushort *)resultPtr;
+
+                                        log_error(" Expected: 0x%2.2x Actual: "
+                                                  "0x%2.2x \n",
+                                                  ref_value[0], test_value[0]);
+
+                                        log_error("    Expected: 0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  ref_value[0] & 0x1F,
+                                                  (ref_value[0] >> 5) & 0x3F,
+                                                  (ref_value[0] >> 11) & 0x1F);
+                                        log_error("    Actual:   0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  test_value[0] & 0x1F,
+                                                  (test_value[0] >> 5) & 0x3F,
+                                                  (test_value[0] >> 11) & 0x1F);
+                                        log_error("    Error:    %f %f %f\n",
+                                                  errors[0], errors[1],
+                                                  errors[2]);
+                                        break;
+                                    }
+                                    case CL_UNORM_SHORT_555: {
+                                        cl_ushort *ref_value =
+                                            (cl_ushort *)resultBuffer;
+                                        cl_ushort *test_value =
+                                            (cl_ushort *)resultPtr;
+
+                                        log_error(" Expected: 0x%2.2x Actual: "
+                                                  "0x%2.2x \n",
+                                                  ref_value[0], test_value[0]);
+
+                                        log_error("    Expected: 0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  ref_value[0] & 0x1F,
+                                                  (ref_value[0] >> 5) & 0x1F,
+                                                  (ref_value[0] >> 10) & 0x1F);
+                                        log_error("    Actual:   0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  test_value[0] & 0x1F,
+                                                  (test_value[0] >> 5) & 0x1F,
+                                                  (test_value[0] >> 10) & 0x1F);
+                                        log_error("    Error:    %f %f %f\n",
+                                                  errors[0], errors[1],
+                                                  errors[2]);
+                                        break;
+                                    }
                                     case CL_UNORM_INT16:
                                     case CL_SNORM_INT16:
                                     case CL_UNSIGNED_INT16:
diff --git a/test_conformance/images/kernel_read_write/test_write_1D_array.cpp b/test_conformance/images/kernel_read_write/test_write_1D_array.cpp
index c771704cad..e9aa8d2a6a 100644
--- a/test_conformance/images/kernel_read_write/test_write_1D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_1D_array.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 #include "../testBase.h"
+#include "test_common.h"
 
 #if !defined(_WIN32)
 #include <sys/mman.h>
@@ -415,6 +416,9 @@ int test_write_image_1D_array( cl_device_id device, cl_context context, cl_comma
                     }
                     else
                     {
+
+                        filter_undefined_bits(imageInfo, resultPtr);
+
                         // Exact result passes every time
                         if( memcmp( resultBuffer, resultPtr, pixelSize ) != 0 )
                         {
@@ -423,21 +427,8 @@ int test_write_image_1D_array( cl_device_id device, cl_context context, cl_comma
                             float errors[4] = {NAN, NAN, NAN, NAN};
                             pack_image_pixel_error( (float *)imagePtr, imageInfo->format, resultBuffer, errors );
 
-                            // We are allowed 0.6 absolute error vs. infinitely precise for some normalized formats
-                            if( 0 == forceCorrectlyRoundedWrites    &&
-                               (
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT8 ||
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 ||
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT16 ||
-                                imageInfo->format->image_channel_data_type == CL_SNORM_INT8 ||
-                                imageInfo->format->image_channel_data_type == CL_SNORM_INT16
-                                ))
-                            {
-                                if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) &&
-                                   ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f)  )
-                                    failure = 0;
-                            }
-
+                            failure = filter_rounding_errors(
+                                forceCorrectlyRoundedWrites, imageInfo, errors);
 
                             if( failure )
                             {
@@ -478,6 +469,56 @@ int test_write_image_1D_array( cl_device_id device, cl_context context, cl_comma
                                         log_error( "    Actual:   0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", ((cl_uchar*)resultPtr)[0], ((cl_uchar*)resultPtr)[1], ((cl_uchar*)resultPtr)[2], ((cl_uchar*)resultPtr)[3] );
                                         log_error( "    Error:    %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] );
                                         break;
+                                    case CL_UNORM_SHORT_565: {
+                                        cl_ushort *ref_value =
+                                            (cl_ushort *)resultBuffer;
+                                        cl_ushort *test_value =
+                                            (cl_ushort *)resultPtr;
+
+                                        log_error(" Expected: 0x%2.2x Actual: "
+                                                  "0x%2.2x \n",
+                                                  ref_value[0], test_value[0]);
+
+                                        log_error("    Expected: 0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  ref_value[0] & 0x1F,
+                                                  (ref_value[0] >> 5) & 0x3F,
+                                                  (ref_value[0] >> 11) & 0x1F);
+                                        log_error("    Actual:   0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  test_value[0] & 0x1F,
+                                                  (test_value[0] >> 5) & 0x3F,
+                                                  (test_value[0] >> 11) & 0x1F);
+                                        log_error("    Error:    %f %f %f\n",
+                                                  errors[0], errors[1],
+                                                  errors[2]);
+                                        break;
+                                    }
+                                    case CL_UNORM_SHORT_555: {
+                                        cl_ushort *ref_value =
+                                            (cl_ushort *)resultBuffer;
+                                        cl_ushort *test_value =
+                                            (cl_ushort *)resultPtr;
+
+                                        log_error(" Expected: 0x%2.2x Actual: "
+                                                  "0x%2.2x \n",
+                                                  ref_value[0], test_value[0]);
+
+                                        log_error("    Expected: 0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  ref_value[0] & 0x1F,
+                                                  (ref_value[0] >> 5) & 0x1F,
+                                                  (ref_value[0] >> 10) & 0x1F);
+                                        log_error("    Actual:   0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  test_value[0] & 0x1F,
+                                                  (test_value[0] >> 5) & 0x1F,
+                                                  (test_value[0] >> 10) & 0x1F);
+                                        log_error("    Error:    %f %f %f\n",
+                                                  errors[0], errors[1],
+                                                  errors[2]);
+                                        break;
+                                    }
                                     case CL_UNORM_INT16:
                                     case CL_SNORM_INT16:
                                     case CL_UNSIGNED_INT16:
diff --git a/test_conformance/images/kernel_read_write/test_write_2D_array.cpp b/test_conformance/images/kernel_read_write/test_write_2D_array.cpp
index 08a7a80334..5bca7124e8 100644
--- a/test_conformance/images/kernel_read_write/test_write_2D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_2D_array.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 #include "../testBase.h"
+#include "test_common.h"
 
 #if !defined(_WIN32)
 #include <sys/mman.h>
@@ -438,6 +439,9 @@ int test_write_image_2D_array( cl_device_id device, cl_context context, cl_comma
                         }
                         else
                         {
+
+                            filter_undefined_bits(imageInfo, resultPtr);
+
                             // Exact result passes every time
                             if( memcmp( resultBuffer, resultPtr, get_pixel_size( imageInfo->format ) ) != 0 )
                             {
@@ -446,21 +450,9 @@ int test_write_image_2D_array( cl_device_id device, cl_context context, cl_comma
                                 float errors[4] = {NAN, NAN, NAN, NAN};
                                 pack_image_pixel_error( (float *)imagePtr, imageInfo->format, resultBuffer, errors );
 
-                                // We are allowed 0.6 absolute error vs. infinitely precise for some normalized formats
-                                if( 0 == forceCorrectlyRoundedWrites    &&
-                                   (
-                                    imageInfo->format->image_channel_data_type == CL_UNORM_INT8 ||
-                                    imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 ||
-                                    imageInfo->format->image_channel_data_type == CL_UNORM_INT16 ||
-                                    imageInfo->format->image_channel_data_type == CL_SNORM_INT8 ||
-                                    imageInfo->format->image_channel_data_type == CL_SNORM_INT16
-                                    ))
-                                {
-                                    if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) &&
-                                       ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f)  )
-                                        failure = 0;
-                                }
-
+                                failure = filter_rounding_errors(
+                                    forceCorrectlyRoundedWrites, imageInfo,
+                                    errors);
 
                                 if( failure )
                                 {
@@ -501,6 +493,64 @@ int test_write_image_2D_array( cl_device_id device, cl_context context, cl_comma
                                             log_error( "    Actual:   0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", ((cl_uchar*)resultPtr)[0], ((cl_uchar*)resultPtr)[1], ((cl_uchar*)resultPtr)[2], ((cl_uchar*)resultPtr)[3] );
                                             log_error( "    Error:    %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] );
                                             break;
+                                        case CL_UNORM_SHORT_565: {
+                                            cl_ushort *ref_value =
+                                                (cl_ushort *)resultBuffer;
+                                            cl_ushort *test_value =
+                                                (cl_ushort *)resultPtr;
+
+                                            log_error(" Expected: 0x%2.2x "
+                                                      "Actual: 0x%2.2x \n",
+                                                      ref_value[0],
+                                                      test_value[0]);
+
+                                            log_error(
+                                                "    Expected: 0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                ref_value[0] & 0x1F,
+                                                (ref_value[0] >> 5) & 0x3F,
+                                                (ref_value[0] >> 11) & 0x1F);
+                                            log_error(
+                                                "    Actual:   0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                test_value[0] & 0x1F,
+                                                (test_value[0] >> 5) & 0x3F,
+                                                (test_value[0] >> 11) & 0x1F);
+                                            log_error(
+                                                "    Error:    %f %f %f\n",
+                                                errors[0], errors[1],
+                                                errors[2]);
+                                            break;
+                                        }
+                                        case CL_UNORM_SHORT_555: {
+                                            cl_ushort *ref_value =
+                                                (cl_ushort *)resultBuffer;
+                                            cl_ushort *test_value =
+                                                (cl_ushort *)resultPtr;
+
+                                            log_error(" Expected: 0x%2.2x "
+                                                      "Actual: 0x%2.2x \n",
+                                                      ref_value[0],
+                                                      test_value[0]);
+
+                                            log_error(
+                                                "    Expected: 0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                ref_value[0] & 0x1F,
+                                                (ref_value[0] >> 5) & 0x1F,
+                                                (ref_value[0] >> 10) & 0x1F);
+                                            log_error(
+                                                "    Actual:   0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                test_value[0] & 0x1F,
+                                                (test_value[0] >> 5) & 0x1F,
+                                                (test_value[0] >> 10) & 0x1F);
+                                            log_error(
+                                                "    Error:    %f %f %f\n",
+                                                errors[0], errors[1],
+                                                errors[2]);
+                                            break;
+                                        }
                                         case CL_UNORM_INT16:
                                         case CL_SNORM_INT16:
                                         case CL_UNSIGNED_INT16:
diff --git a/test_conformance/images/kernel_read_write/test_write_3D.cpp b/test_conformance/images/kernel_read_write/test_write_3D.cpp
index 5cc96bb4b0..d9a69627b1 100644
--- a/test_conformance/images/kernel_read_write/test_write_3D.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_3D.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 #include "../testBase.h"
+#include "test_common.h"
 
 #if !defined(_WIN32)
 #include <sys/mman.h>
@@ -445,6 +446,9 @@ int test_write_image_3D( cl_device_id device, cl_context context, cl_command_que
                         }
                         else
                         {
+
+                            filter_undefined_bits(imageInfo, resultPtr);
+
                             // Exact result passes every time
                             if( memcmp( resultBuffer, resultPtr, get_pixel_size( imageInfo->format ) ) != 0 )
                             {
@@ -453,21 +457,9 @@ int test_write_image_3D( cl_device_id device, cl_context context, cl_command_que
                                 float errors[4] = {NAN, NAN, NAN, NAN};
                                 pack_image_pixel_error( (float *)imagePtr, imageInfo->format, resultBuffer, errors );
 
-                                // We are allowed 0.6 absolute error vs. infinitely precise for some normalized formats
-                                if( 0 == forceCorrectlyRoundedWrites    &&
-                                   (
-                                    imageInfo->format->image_channel_data_type == CL_UNORM_INT8 ||
-                                    imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 ||
-                                    imageInfo->format->image_channel_data_type == CL_UNORM_INT16 ||
-                                    imageInfo->format->image_channel_data_type == CL_SNORM_INT8 ||
-                                    imageInfo->format->image_channel_data_type == CL_SNORM_INT16
-                                    ))
-                                {
-                                    if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) &&
-                                       ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f)  )
-                                        failure = 0;
-                                }
-
+                                failure = filter_rounding_errors(
+                                    forceCorrectlyRoundedWrites, imageInfo,
+                                    errors);
 
                                 if( failure )
                                 {
@@ -508,6 +500,64 @@ int test_write_image_3D( cl_device_id device, cl_context context, cl_command_que
                                             log_error( "    Actual:   0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", ((cl_uchar*)resultPtr)[0], ((cl_uchar*)resultPtr)[1], ((cl_uchar*)resultPtr)[2], ((cl_uchar*)resultPtr)[3] );
                                             log_error( "    Error:    %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] );
                                             break;
+                                        case CL_UNORM_SHORT_565: {
+                                            cl_ushort *ref_value =
+                                                (cl_ushort *)resultBuffer;
+                                            cl_ushort *test_value =
+                                                (cl_ushort *)resultPtr;
+
+                                            log_error(" Expected: 0x%2.2x "
+                                                      "Actual: 0x%2.2x \n",
+                                                      ref_value[0],
+                                                      test_value[0]);
+
+                                            log_error(
+                                                "    Expected: 0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                ref_value[0] & 0x1F,
+                                                (ref_value[0] >> 5) & 0x3F,
+                                                (ref_value[0] >> 11) & 0x1F);
+                                            log_error(
+                                                "    Actual:   0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                test_value[0] & 0x1F,
+                                                (test_value[0] >> 5) & 0x3F,
+                                                (test_value[0] >> 11) & 0x1F);
+                                            log_error(
+                                                "    Error:    %f %f %f\n",
+                                                errors[0], errors[1],
+                                                errors[2]);
+                                            break;
+                                        }
+                                        case CL_UNORM_SHORT_555: {
+                                            cl_ushort *ref_value =
+                                                (cl_ushort *)resultBuffer;
+                                            cl_ushort *test_value =
+                                                (cl_ushort *)resultPtr;
+
+                                            log_error(" Expected: 0x%2.2x "
+                                                      "Actual: 0x%2.2x \n",
+                                                      ref_value[0],
+                                                      test_value[0]);
+
+                                            log_error(
+                                                "    Expected: 0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                ref_value[0] & 0x1F,
+                                                (ref_value[0] >> 5) & 0x1F,
+                                                (ref_value[0] >> 10) & 0x1F);
+                                            log_error(
+                                                "    Actual:   0x%2.2x 0x%2.2x "
+                                                "0x%2.2x \n",
+                                                test_value[0] & 0x1F,
+                                                (test_value[0] >> 5) & 0x1F,
+                                                (test_value[0] >> 10) & 0x1F);
+                                            log_error(
+                                                "    Error:    %f %f %f\n",
+                                                errors[0], errors[1],
+                                                errors[2]);
+                                            break;
+                                        }
                                         case CL_UNORM_INT16:
                                         case CL_SNORM_INT16:
                                         case CL_UNSIGNED_INT16:
diff --git a/test_conformance/images/kernel_read_write/test_write_image.cpp b/test_conformance/images/kernel_read_write/test_write_image.cpp
index e40e80d61f..9cc9698ce2 100644
--- a/test_conformance/images/kernel_read_write/test_write_image.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_image.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 #include "../testBase.h"
+#include "test_common.h"
 
 #if !defined(_WIN32)
 #include <sys/mman.h>
@@ -477,6 +478,9 @@ int test_write_image( cl_device_id device, cl_context context, cl_command_queue
                     }
                     else
                     {
+
+                        filter_undefined_bits(imageInfo, resultPtr);
+
                         // Exact result passes every time
                         if( memcmp( resultBuffer, resultPtr, get_pixel_size( imageInfo->format ) ) != 0 )
                         {
@@ -485,21 +489,8 @@ int test_write_image( cl_device_id device, cl_context context, cl_command_queue
                             float errors[4] = {NAN, NAN, NAN, NAN};
                             pack_image_pixel_error( (float *)imagePtr, imageInfo->format, resultBuffer, errors );
 
-                            // We are allowed 0.6 absolute error vs. infinitely precise for some normalized formats
-                            if( 0 == forceCorrectlyRoundedWrites    &&
-                               (
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT8 ||
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 ||
-                                imageInfo->format->image_channel_data_type == CL_UNORM_INT16 ||
-                                imageInfo->format->image_channel_data_type == CL_SNORM_INT8 ||
-                                imageInfo->format->image_channel_data_type == CL_SNORM_INT16
-                                ))
-                            {
-                                if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) &&
-                                   ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f)  )
-                                    failure = 0;
-                            }
-
+                            failure = filter_rounding_errors(
+                                forceCorrectlyRoundedWrites, imageInfo, errors);
 
                             if( failure )
                             {
@@ -577,6 +568,57 @@ int test_write_image( cl_device_id device, cl_context context, cl_command_queue
                                         log_error( "    Actual:   %a %a %a %a\n", ((cl_float*)resultPtr)[0], ((cl_float*)resultPtr)[1], ((cl_float*)resultPtr)[2], ((cl_float*)resultPtr)[3] );
                                         log_error( "    Ulps:     %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] );
                                         break;
+                                    case CL_UNORM_SHORT_565: {
+                                        // A 565 pixel is a single 16-bit word.
+                                        cl_ushort *ref_value =
+                                            (cl_ushort *)resultBuffer;
+                                        cl_ushort *test_value =
+                                            (cl_ushort *)resultPtr;
+                                        log_error(" Expected: 0x%2.2x Actual: "
+                                                  "0x%2.2x \n",
+                                                  ref_value[0], test_value[0]);
+                                        log_error("    Expected: 0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  ref_value[0] & 0x1F,
+                                                  (ref_value[0] >> 5) & 0x3F,
+                                                  (ref_value[0] >> 11) & 0x1F);
+                                        log_error("    Actual:   0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  test_value[0] & 0x1F,
+                                                  (test_value[0] >> 5) & 0x3F,
+                                                  (test_value[0] >> 11) & 0x1F);
+                                        // Three fields, hence three %f values.
+                                        log_error("    Error:    %f %f %f\n",
+                                                  errors[0], errors[1],
+                                                  errors[2]);
+                                        break;
+                                    }
+
+                                    case CL_UNORM_SHORT_555: {
+                                        // A 555 pixel fits one 16-bit word.
+                                        cl_ushort *ref_value =
+                                            (cl_ushort *)resultBuffer;
+                                        cl_ushort *test_value =
+                                            (cl_ushort *)resultPtr;
+                                        log_error(" Expected: 0x%2.2x Actual: "
+                                                  "0x%2.2x \n",
+                                                  ref_value[0], test_value[0]);
+                                        log_error("    Expected: 0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  ref_value[0] & 0x1F,
+                                                  (ref_value[0] >> 5) & 0x1F,
+                                                  (ref_value[0] >> 10) & 0x1F);
+                                        log_error("    Actual:   0x%2.2x "
+                                                  "0x%2.2x 0x%2.2x \n",
+                                                  test_value[0] & 0x1F,
+                                                  (test_value[0] >> 5) & 0x1F,
+                                                  (test_value[0] >> 10) & 0x1F);
+                                        // Three fields, hence three %f values.
+                                        log_error("    Error:    %f %f %f\n",
+                                                  errors[0], errors[1],
+                                                  errors[2]);
+                                        break;
+                                    }
                                 }
 
                                 float *v = (float *)(char *)imagePtr;

From 69f0054001438078c11478546b855c06e07e1817 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Thu, 17 Jun 2021 14:05:05 +0100
Subject: [PATCH 107/158] Fix copy and move semantics of wrapper classes
 (#1268)

* Remove unnecessary code

These custom equality operators are not necessary because of the
conversion operators which already allow using the standard equality
operators between two pointers.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>

* Fix copy and move semantics of wrapper classes

Related to #465.

The Wrapper classes are rewritten to properly handle copy and move
semantics, while preserving the existing API and removing code
duplication.

Add error handling around clRelease* and clRetain*.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>

* Address build issue on 32-bit Windows

Include linkage in RetainReleaseType function type.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_common/harness/typeWrappers.h            | 246 +++++++-----------
 test_conformance/buffers/test_sub_buffers.cpp |   3 +-
 2 files changed, 91 insertions(+), 158 deletions(-)

diff --git a/test_common/harness/typeWrappers.h b/test_common/harness/typeWrappers.h
index 9a58a9d2b8..50c7c9387f 100644
--- a/test_common/harness/typeWrappers.h
+++ b/test_common/harness/typeWrappers.h
@@ -16,122 +16,134 @@
 #ifndef _typeWrappers_h
 #define _typeWrappers_h
 
-#include <stdio.h>
-#include <stdlib.h>
-
 #if !defined(_WIN32)
 #include <sys/mman.h>
 #endif
 
 #include "compat.h"
-#include <stdio.h>
 #include "mt19937.h"
 #include "errorHelpers.h"
 #include "kernelHelpers.h"
 
-/* cl_context wrapper */
+#include <cstdlib>
+#include <type_traits>
 
-class clContextWrapper {
-public:
-    clContextWrapper() { mContext = NULL; }
-    clContextWrapper(cl_context program) { mContext = program; }
-    ~clContextWrapper()
-    {
-        if (mContext != NULL) clReleaseContext(mContext);
-    }
+namespace wrapper_details {
+
+// clRetain*() and clRelease*() functions share the same type.
+template <typename T> // T should be cl_context, cl_program, ...
+using RetainReleaseType = cl_int CL_API_CALL(T);
 
-    clContextWrapper &operator=(const cl_context &rhs)
+// A generic wrapper class that follows OpenCL retain/release semantics.
+//
+// This Wrapper class implements copy and move semantics, which makes it
+// compatible with standard containers for example.
+//
+// Template parameters:
+//  - T is the cl_* type (e.g. cl_context, cl_program, ...)
+//  - Retain is the clRetain* function (e.g. clRetainContext, ...)
+//  - Release is the clRelease* function (e.g. clReleaseContext, ...)
+template <typename T, RetainReleaseType<T> Retain, RetainReleaseType<T> Release>
+class Wrapper {
+    static_assert(std::is_pointer<T>::value, "T should be a pointer type.");
+    T object = nullptr;
+
+    void retain()
     {
-        mContext = rhs;
-        return *this;
+        if (!object) return;
+
+        auto err = Retain(object);
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "clRetain*() failed");
+            std::abort();
+        }
     }
-    operator cl_context() const { return mContext; }
 
-    cl_context *operator&() { return &mContext; }
+    void release()
+    {
+        if (!object) return;
 
-    bool operator==(const cl_context &rhs) { return mContext == rhs; }
+        auto err = Release(object);
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "clRelease*() failed");
+            std::abort();
+        }
+    }
 
-protected:
-    cl_context mContext;
-};
+public:
+    Wrapper() = default;
 
-/* cl_program wrapper */
+    // On initialisation, assume the object has a refcount of one.
+    Wrapper(T object): object(object) {}
 
-class clProgramWrapper {
-public:
-    clProgramWrapper() { mProgram = NULL; }
-    clProgramWrapper(cl_program program) { mProgram = program; }
-    ~clProgramWrapper()
+    // On assignment, assume the object has a refcount of one.
+    Wrapper &operator=(T rhs)
     {
-        if (mProgram != NULL) clReleaseProgram(mProgram);
+        reset(rhs);
+        return *this;
     }
 
-    clProgramWrapper &operator=(const cl_program &rhs)
+    // Copy semantics: share the handle and take one extra reference on it.
+    Wrapper(Wrapper const &w) { *this = w; }
+    Wrapper &operator=(Wrapper const &w)
     {
-        mProgram = rhs;
+        // Same handle (e.g. self-assignment): releasing first could destroy it.
+        if (object != w.object) { reset(w.object); retain(); }
         return *this;
     }
-    operator cl_program() const { return mProgram; }
-
-    cl_program *operator&() { return &mProgram; }
 
-    bool operator==(const cl_program &rhs) { return mProgram == rhs; }
-
-protected:
-    cl_program mProgram;
-};
-
-/* cl_kernel wrapper */
-
-class clKernelWrapper {
-public:
-    clKernelWrapper() { mKernel = NULL; }
-    clKernelWrapper(cl_kernel kernel) { mKernel = kernel; }
-    ~clKernelWrapper()
+    // Move semantics, directly take ownership.
+    Wrapper(Wrapper &&w) { *this = std::move(w); }
+    Wrapper &operator=(Wrapper &&w)
     {
-        if (mKernel != NULL) clReleaseKernel(mKernel);
+        reset(w.object);
+        w.object = nullptr;
+        return *this;
     }
 
-    clKernelWrapper &operator=(const cl_kernel &rhs)
+    ~Wrapper() { reset(); }
+
+    // Release the existing object, if any, and own the new one, if any.
+    void reset(T new_object = nullptr)
     {
-        mKernel = rhs;
-        return *this;
+        release();
+        object = new_object;
     }
-    operator cl_kernel() const { return mKernel; }
 
-    cl_kernel *operator&() { return &mKernel; }
+    operator T() const { return object; }
 
-    bool operator==(const cl_kernel &rhs) { return mKernel == rhs; }
-
-protected:
-    cl_kernel mKernel;
+    // Ideally this function should not exist as it breaks encapsulation by
+    // allowing external mutation of the Wrapper internal state. However, too
+    // much code currently relies on this. For example, instead of using T* as
+    // output parameters, existing code can be updated to use Wrapper& instead.
+    T *operator&() { return &object; }
 };
 
-/* cl_mem (stream) wrapper */
+} // namespace wrapper_details
 
-class clMemWrapper {
-public:
-    clMemWrapper() { mMem = NULL; }
-    clMemWrapper(cl_mem mem) { mMem = mem; }
-    ~clMemWrapper()
-    {
-        if (mMem != NULL) clReleaseMemObject(mMem);
-    }
+using clContextWrapper =
+    wrapper_details::Wrapper<cl_context, clRetainContext, clReleaseContext>;
 
-    clMemWrapper &operator=(const cl_mem &rhs)
-    {
-        mMem = rhs;
-        return *this;
-    }
-    operator cl_mem() const { return mMem; }
+using clProgramWrapper =
+    wrapper_details::Wrapper<cl_program, clRetainProgram, clReleaseProgram>;
 
-    cl_mem *operator&() { return &mMem; }
+using clKernelWrapper =
+    wrapper_details::Wrapper<cl_kernel, clRetainKernel, clReleaseKernel>;
 
-    bool operator==(const cl_mem &rhs) { return mMem == rhs; }
+using clMemWrapper =
+    wrapper_details::Wrapper<cl_mem, clRetainMemObject, clReleaseMemObject>;
 
-protected:
-    cl_mem mMem;
-};
+using clCommandQueueWrapper =
+    wrapper_details::Wrapper<cl_command_queue, clRetainCommandQueue,
+                             clReleaseCommandQueue>;
+
+using clSamplerWrapper =
+    wrapper_details::Wrapper<cl_sampler, clRetainSampler, clReleaseSampler>;
+
+using clEventWrapper =
+    wrapper_details::Wrapper<cl_event, clRetainEvent, clReleaseEvent>;
 
 class clProtectedImage {
 public:
@@ -183,92 +195,12 @@ class clProtectedImage {
 
     cl_mem *operator&() { return &image; }
 
-    bool operator==(const cl_mem &rhs) { return image == rhs; }
-
 protected:
     void *backingStore;
     size_t backingStoreSize;
     cl_mem image;
 };
 
-/* cl_command_queue wrapper */
-class clCommandQueueWrapper {
-public:
-    clCommandQueueWrapper() { mMem = NULL; }
-    clCommandQueueWrapper(cl_command_queue mem) { mMem = mem; }
-    ~clCommandQueueWrapper()
-    {
-        if (mMem != NULL)
-        {
-            clReleaseCommandQueue(mMem);
-        }
-    }
-
-    clCommandQueueWrapper &operator=(const cl_command_queue &rhs)
-    {
-        mMem = rhs;
-        return *this;
-    }
-    operator cl_command_queue() const { return mMem; }
-
-    cl_command_queue *operator&() { return &mMem; }
-
-    bool operator==(const cl_command_queue &rhs) { return mMem == rhs; }
-
-protected:
-    cl_command_queue mMem;
-};
-
-/* cl_sampler wrapper */
-class clSamplerWrapper {
-public:
-    clSamplerWrapper() { mMem = NULL; }
-    clSamplerWrapper(cl_sampler mem) { mMem = mem; }
-    ~clSamplerWrapper()
-    {
-        if (mMem != NULL) clReleaseSampler(mMem);
-    }
-
-    clSamplerWrapper &operator=(const cl_sampler &rhs)
-    {
-        mMem = rhs;
-        return *this;
-    }
-    operator cl_sampler() const { return mMem; }
-
-    cl_sampler *operator&() { return &mMem; }
-
-    bool operator==(const cl_sampler &rhs) { return mMem == rhs; }
-
-protected:
-    cl_sampler mMem;
-};
-
-/* cl_event wrapper */
-class clEventWrapper {
-public:
-    clEventWrapper() { mMem = NULL; }
-    clEventWrapper(cl_event mem) { mMem = mem; }
-    ~clEventWrapper()
-    {
-        if (mMem != NULL) clReleaseEvent(mMem);
-    }
-
-    clEventWrapper &operator=(const cl_event &rhs)
-    {
-        mMem = rhs;
-        return *this;
-    }
-    operator cl_event() const { return mMem; }
-
-    cl_event *operator&() { return &mMem; }
-
-    bool operator==(const cl_event &rhs) { return mMem == rhs; }
-
-protected:
-    cl_event mMem;
-};
-
 /* Generic protected memory buffer, for verifying access within bounds */
 class clProtectedArray {
 public:
diff --git a/test_conformance/buffers/test_sub_buffers.cpp b/test_conformance/buffers/test_sub_buffers.cpp
index 3e50121a10..691509fdcc 100644
--- a/test_conformance/buffers/test_sub_buffers.cpp
+++ b/test_conformance/buffers/test_sub_buffers.cpp
@@ -39,7 +39,8 @@ class SubBufferWrapper : public clMemWrapper
         region.size = mSize;
 
         cl_int error;
-        mMem = clCreateSubBuffer( mParentBuffer, flags, CL_BUFFER_CREATE_TYPE_REGION, &region, &error );
+        reset(clCreateSubBuffer(mParentBuffer, flags,
+                                CL_BUFFER_CREATE_TYPE_REGION, &region, &error));
         return error;
     }
 };

From 236cd73fa17ed0c280b7aa6cd8a3dd116c4e5d2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=A9vin=20Petit?= <kpet@free.fr>
Date: Fri, 2 Jul 2021 10:34:13 +0100
Subject: [PATCH 108/158] Use macOS 10 in CI (#1282)

macOS jobs frequently fail. Since macos-11.0 support is considered experimental,
move to macos-10, using macos-latest so we automatically move to 11 when
stable.

See https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners

Signed-off-by: Kevin Petit <kevin.petit@arm.com>
---
 .github/workflows/presubmit.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml
index 8ef7e663e0..2aedc199f1 100644
--- a/.github/workflows/presubmit.yml
+++ b/.github/workflows/presubmit.yml
@@ -11,7 +11,7 @@ jobs:
     strategy:
       matrix:
         mainmatrix: [true]
-        os: [ubuntu-20.04, macos-11.0]
+        os: [ubuntu-20.04, macos-latest]
         include:
           - os: ubuntu-20.04
             mainmatrix: true

From 4a03bb79cb8fbd6012b02783e59565cce0b1f376 Mon Sep 17 00:00:00 2001
From: James Price <jrprice@google.com>
Date: Mon, 5 Jul 2021 10:35:39 -0400
Subject: [PATCH 109/158] Fix double-release of memory objects (#1277)

A recent update to the object wrapper classes (#1268) changed the
behavior of assigning to a wrapper, whereby the wrapped object is now
released upon assignment. A couple of tests were manually calling
clReleaseMemObject and then assigning `nullptr` to the wrapper,
resulting in the wrapper calling clReleaseMemObject on an object that
had already been destroyed.
---
 test_conformance/api/test_mem_object_info.cpp | 7 -------
 test_conformance/api/test_mem_objects.cpp     | 7 +------
 2 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/test_conformance/api/test_mem_object_info.cpp b/test_conformance/api/test_mem_object_info.cpp
index ccfeaafa13..2afe043768 100644
--- a/test_conformance/api/test_mem_object_info.cpp
+++ b/test_conformance/api/test_mem_object_info.cpp
@@ -348,14 +348,7 @@ int test_get_buffer_info( cl_device_id deviceID, cl_context context, cl_command_
             TEST_MEM_OBJECT_PARAM( subBufferObject, CL_MEM_ASSOCIATED_MEMOBJECT, origObj, (cl_mem)bufferObject, "associated mem object", "%p", void * )
 
             TEST_MEM_OBJECT_PARAM( subBufferObject, CL_MEM_OFFSET, offset, (size_t)( addressAlign ), "offset", "%ld", size_t )
-
-            clReleaseMemObject( subBufferObject );
-            subBufferObject = NULL;
-
         }
-
-        clReleaseMemObject( bufferObject );
-        bufferObject = NULL;
     }
 
     return CL_SUCCESS;
diff --git a/test_conformance/api/test_mem_objects.cpp b/test_conformance/api/test_mem_objects.cpp
index c29613f9fe..f1a4e99339 100644
--- a/test_conformance/api/test_mem_objects.cpp
+++ b/test_conformance/api/test_mem_objects.cpp
@@ -48,12 +48,7 @@ int test_mem_object_destructor_callback_single(clMemWrapper &memObject)
     test_error(error, "Unable to set destructor callback");
 
     // Now release the buffer, which SHOULD call the callbacks
-    error = clReleaseMemObject(memObject);
-    test_error(error, "Unable to release test buffer");
-
-    // Note: since we manually released the mem wrapper, we need to set it to
-    // NULL to prevent a double-release
-    memObject = NULL;
+    memObject.reset();
 
     // At this point, all three callbacks should have already been called
     int numErrors = 0;

From 433974fd2810f91b093f10121adca64e1eefd789 Mon Sep 17 00:00:00 2001
From: BKoscielak <bartosz.koscielak@intel.com>
Date: Tue, 13 Jul 2021 18:15:33 +0200
Subject: [PATCH 110/158] Fix check for image support in test_basic sizeof
 (#1269)

---
 test_conformance/basic/test_sizeof.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test_conformance/basic/test_sizeof.cpp b/test_conformance/basic/test_sizeof.cpp
index 66a6c563cf..6b1ddb56bd 100644
--- a/test_conformance/basic/test_sizeof.cpp
+++ b/test_conformance/basic/test_sizeof.cpp
@@ -292,11 +292,11 @@ int test_sizeof(cl_device_id device, cl_context context, cl_command_queue queue,
             continue;
         }
 
-        if( gIsEmbedded &&
-           0 == strcmp(other_types[i], "image3d_t") &&
-           checkFor3DImageSupport( device ) == CL_IMAGE_FORMAT_NOT_SUPPORTED)
+        if (0 == strcmp(other_types[i], "image3d_t")
+            && checkFor3DImageSupport(device) == CL_IMAGE_FORMAT_NOT_SUPPORTED)
         {
-            log_info("\n3D images are not supported by this device. Skipping test.\t");
+            log_info("\n3D images are not supported by this device. "
+                     "Skipping test.\t");
             continue;
         }
 

From b500da5fbc97a2fc73ee39e30c00e7d759a11215 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Wed, 21 Jul 2021 00:48:48 -0700
Subject: [PATCH 111/158] add basic test for cl_khr_pci_bus_info (#1227)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add basic test for cl_khr_pci_bus_info

* correctly use TEST_SKIPPED_ITSELF

Co-authored-by: Kévin Petit <kpet@free.fr>

* fix related usage of TEST_SKIPPED_ITSELF

Co-authored-by: Kévin Petit <kpet@free.fr>
---
 test_conformance/computeinfo/CMakeLists.txt   |  1 +
 test_conformance/computeinfo/device_uuid.cpp  |  2 +-
 test_conformance/computeinfo/main.cpp         |  3 +-
 test_conformance/computeinfo/pci_bus_info.cpp | 53 +++++++++++++++++++
 4 files changed, 57 insertions(+), 2 deletions(-)
 create mode 100644 test_conformance/computeinfo/pci_bus_info.cpp

diff --git a/test_conformance/computeinfo/CMakeLists.txt b/test_conformance/computeinfo/CMakeLists.txt
index 207223a3e3..06f0599c11 100644
--- a/test_conformance/computeinfo/CMakeLists.txt
+++ b/test_conformance/computeinfo/CMakeLists.txt
@@ -5,6 +5,7 @@ set(${MODULE_NAME}_SOURCES
         device_uuid.cpp
         extended_versioning.cpp
         conforming_version.cpp
+        pci_bus_info.cpp
 )
 
 include(../CMakeCommon.txt)
diff --git a/test_conformance/computeinfo/device_uuid.cpp b/test_conformance/computeinfo/device_uuid.cpp
index 1ef9dad2ed..7f29d0b683 100644
--- a/test_conformance/computeinfo/device_uuid.cpp
+++ b/test_conformance/computeinfo/device_uuid.cpp
@@ -105,7 +105,7 @@ int test_device_uuid(cl_device_id deviceID, cl_context context,
     if (!is_extension_available(deviceID, "cl_khr_device_uuid"))
     {
         log_info("cl_khr_device_uuid not supported. Skipping test...\n");
-        return 0;
+        return TEST_SKIPPED_ITSELF;
     }
 
     int total_errors = 0;
diff --git a/test_conformance/computeinfo/main.cpp b/test_conformance/computeinfo/main.cpp
index 4860b44561..d993655b9a 100644
--- a/test_conformance/computeinfo/main.cpp
+++ b/test_conformance/computeinfo/main.cpp
@@ -1421,15 +1421,16 @@ int test_computeinfo(cl_device_id deviceID, cl_context context,
 extern int test_extended_versioning(cl_device_id, cl_context, cl_command_queue,
                                     int);
 extern int test_device_uuid(cl_device_id, cl_context, cl_command_queue, int);
-
 extern int test_conformance_version(cl_device_id, cl_context, cl_command_queue,
                                     int);
+extern int test_pci_bus_info(cl_device_id, cl_context, cl_command_queue, int);
 
 test_definition test_list[] = {
     ADD_TEST(computeinfo),
     ADD_TEST(extended_versioning),
     ADD_TEST(device_uuid),
     ADD_TEST_VERSION(conformance_version, Version(3, 0)),
+    ADD_TEST(pci_bus_info),
 };
 
 const int test_num = ARRAY_SIZE(test_list);
diff --git a/test_conformance/computeinfo/pci_bus_info.cpp b/test_conformance/computeinfo/pci_bus_info.cpp
new file mode 100644
index 0000000000..cd62ca0513
--- /dev/null
+++ b/test_conformance/computeinfo/pci_bus_info.cpp
@@ -0,0 +1,53 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "harness/compat.h"
+
+#include <array>
+#include <bitset>
+
+#include "harness/testHarness.h"
+#include "harness/deviceInfo.h"
+
+int test_pci_bus_info(cl_device_id deviceID, cl_context context,
+                      cl_command_queue ignoreQueue, int num_elements)
+{
+    if (!is_extension_available(deviceID, "cl_khr_pci_bus_info"))
+    {
+        log_info("cl_khr_pci_bus_info not supported. Skipping test...\n");
+        return TEST_SKIPPED_ITSELF;
+    }
+
+    cl_int error;
+
+    cl_device_pci_bus_info_khr info;
+
+    size_t size_ret;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_PCI_BUS_INFO_KHR, 0, NULL,
+                            &size_ret);
+    test_error(error, "Unable to query CL_DEVICE_PCI_BUS_INFO_KHR size");
+    test_assert_error(
+        size_ret == sizeof(info),
+        "Query for CL_DEVICE_PCI_BUS_INFO_KHR returned an unexpected size");
+
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_PCI_BUS_INFO_KHR, sizeof(info),
+                            &info, NULL);
+    test_error(error, "Unable to query CL_DEVICE_PCI_BUS_INFO_KHR");
+
+    log_info("\tPCI Bus Info: %04x:%02x:%02x.%x\n", info.pci_domain,
+             info.pci_bus, info.pci_device, info.pci_function);
+
+    return TEST_PASS;
+}

From 12637114ac81d292861daf4bff2397a36581f712 Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Wed, 21 Jul 2021 09:50:22 +0200
Subject: [PATCH 112/158] Fix double release of object in test_api and test_gl
 (#1287)

* Fix clang format only

* Fix double release of objects
---
 .../api/test_context_destructor_callback.cpp  |   7 +-
 test_conformance/gl/test_buffers.cpp          | 415 +++++++-----
 test_conformance/gl/test_fence_sync.cpp       | 624 ++++++++++--------
 3 files changed, 586 insertions(+), 460 deletions(-)

diff --git a/test_conformance/api/test_context_destructor_callback.cpp b/test_conformance/api/test_context_destructor_callback.cpp
index 1d73a3c486..d29d90390a 100644
--- a/test_conformance/api/test_context_destructor_callback.cpp
+++ b/test_conformance/api/test_context_destructor_callback.cpp
@@ -52,12 +52,7 @@ int test_context_destructor_callback(cl_device_id deviceID, cl_context context,
     test_error(error, "Unable to set destructor callback");
 
     // Now release the context, which SHOULD call the callbacks
-    error = clReleaseContext(localContext);
-    test_error(error, "Unable to release local context");
-
-    // Note: since we manually released the context, we need to set it to NULL
-    // to prevent a double-release
-    localContext = NULL;
+    localContext.reset();
 
     // At this point, all three callbacks should have already been called
     int numErrors = 0;
diff --git a/test_conformance/gl/test_buffers.cpp b/test_conformance/gl/test_buffers.cpp
index 35f01ee6bb..c61610d090 100644
--- a/test_conformance/gl/test_buffers.cpp
+++ b/test_conformance/gl/test_buffers.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -17,126 +17,126 @@
 #include "harness/conversions.h"
 #include "harness/typeWrappers.h"
 
-#if !defined (__APPLE__)
-    #include <CL/cl_gl.h>
+#if !defined(__APPLE__)
+#include <CL/cl_gl.h>
 #endif
 
 static const char *bufferKernelPattern =
-"__kernel void sample_test( __global %s%s *source, __global %s%s *clDest, __global %s%s *glDest )\n"
-"{\n"
-"    int  tid = get_global_id(0);\n"
-"     clDest[ tid ] = source[ tid ] + (%s%s)(1);\n"
-"     glDest[ tid ] = source[ tid ] + (%s%s)(2);\n"
-"}\n";
-
-#define TYPE_CASE( enum, type, range, offset )    \
-    case enum:    \
-    {                \
-        cl_##type *ptr = (cl_##type *)outData; \
-        for( i = 0; i < count; i++ ) \
-            ptr[ i ] = (cl_##type)( ( genrand_int32(d) & range ) - offset ); \
-        break; \
+    "__kernel void sample_test( __global %s%s *source, __global %s%s *clDest, "
+    "__global %s%s *glDest )\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "     clDest[ tid ] = source[ tid ] + (%s%s)(1);\n"
+    "     glDest[ tid ] = source[ tid ] + (%s%s)(2);\n"
+    "}\n";
+
+#define TYPE_CASE(enum, type, range, offset)                                   \
+    case enum: {                                                               \
+        cl_##type *ptr = (cl_##type *)outData;                                 \
+        for (i = 0; i < count; i++)                                            \
+            ptr[i] = (cl_##type)((genrand_int32(d) & range) - offset);         \
+        break;                                                                 \
     }
 
-void gen_input_data( ExplicitType type, size_t count, MTdata d, void *outData )
+void gen_input_data(ExplicitType type, size_t count, MTdata d, void *outData)
 {
     size_t i;
 
-    switch( type )
+    switch (type)
     {
-        case kBool:
-        {
+        case kBool: {
             bool *boolPtr = (bool *)outData;
-            for( i = 0; i < count; i++ )
+            for (i = 0; i < count; i++)
             {
-                boolPtr[i] = ( genrand_int32(d) & 1 ) ? true : false;
+                boolPtr[i] = (genrand_int32(d) & 1) ? true : false;
             }
             break;
         }
 
-        TYPE_CASE( kChar, char, 250, 127 )
-        TYPE_CASE( kUChar, uchar, 250, 0 )
-        TYPE_CASE( kShort, short, 65530, 32767 )
-        TYPE_CASE( kUShort, ushort, 65530, 0 )
-        TYPE_CASE( kInt, int, 0x0fffffff, 0x70000000 )
-        TYPE_CASE( kUInt, uint, 0x0fffffff, 0 )
+            TYPE_CASE(kChar, char, 250, 127)
+            TYPE_CASE(kUChar, uchar, 250, 0)
+            TYPE_CASE(kShort, short, 65530, 32767)
+            TYPE_CASE(kUShort, ushort, 65530, 0)
+            TYPE_CASE(kInt, int, 0x0fffffff, 0x70000000)
+            TYPE_CASE(kUInt, uint, 0x0fffffff, 0)
 
-        case kLong:
-        {
+        case kLong: {
             cl_long *longPtr = (cl_long *)outData;
-            for( i = 0; i < count; i++ )
+            for (i = 0; i < count; i++)
             {
-                longPtr[i] = (cl_long)genrand_int32(d) | ( (cl_ulong)genrand_int32(d) << 32 );
+                longPtr[i] = (cl_long)genrand_int32(d)
+                    | ((cl_ulong)genrand_int32(d) << 32);
             }
             break;
         }
 
-        case kULong:
-        {
+        case kULong: {
             cl_ulong *ulongPtr = (cl_ulong *)outData;
-            for( i = 0; i < count; i++ )
+            for (i = 0; i < count; i++)
             {
-                ulongPtr[i] = (cl_ulong)genrand_int32(d) | ( (cl_ulong)genrand_int32(d) << 32 );
+                ulongPtr[i] = (cl_ulong)genrand_int32(d)
+                    | ((cl_ulong)genrand_int32(d) << 32);
             }
             break;
         }
 
-        case kFloat:
-        {
+        case kFloat: {
             cl_float *floatPtr = (float *)outData;
-            for( i = 0; i < count; i++ )
-                floatPtr[i] = get_random_float( -100000.f, 100000.f, d );
+            for (i = 0; i < count; i++)
+                floatPtr[i] = get_random_float(-100000.f, 100000.f, d);
             break;
         }
 
         default:
-            log_error( "ERROR: Invalid type passed in to generate_random_data!\n" );
+            log_error(
+                "ERROR: Invalid type passed in to generate_random_data!\n");
             break;
     }
 }
 
-#define INC_CASE( enum, type )    \
-    case enum:    \
-    {                \
-        cl_##type *src = (cl_##type *)inData; \
-        cl_##type *dst = (cl_##type *)outData; \
-        *dst = *src + 1; \
-        break; \
+#define INC_CASE(enum, type)                                                   \
+    case enum: {                                                               \
+        cl_##type *src = (cl_##type *)inData;                                  \
+        cl_##type *dst = (cl_##type *)outData;                                 \
+        *dst = *src + 1;                                                       \
+        break;                                                                 \
     }
 
-void get_incremented_value( void *inData, void *outData, ExplicitType type )
+void get_incremented_value(void *inData, void *outData, ExplicitType type)
 {
-    switch( type )
+    switch (type)
     {
-        INC_CASE( kChar, char )
-        INC_CASE( kUChar, uchar )
-        INC_CASE( kShort, short )
-        INC_CASE( kUShort, ushort )
-        INC_CASE( kInt, int )
-        INC_CASE( kUInt, uint )
-        INC_CASE( kLong, long )
-        INC_CASE( kULong, ulong )
-        INC_CASE( kFloat, float )
-        default:
-            break;
+        INC_CASE(kChar, char)
+        INC_CASE(kUChar, uchar)
+        INC_CASE(kShort, short)
+        INC_CASE(kUShort, ushort)
+        INC_CASE(kInt, int)
+        INC_CASE(kUInt, uint)
+        INC_CASE(kLong, long)
+        INC_CASE(kULong, ulong)
+        INC_CASE(kFloat, float)
+        default: break;
     }
 }
 
-int test_buffer_kernel(cl_context context, cl_command_queue queue, ExplicitType vecType, size_t vecSize, int numElements, int validate_only, MTdata d)
+int test_buffer_kernel(cl_context context, cl_command_queue queue,
+                       ExplicitType vecType, size_t vecSize, int numElements,
+                       int validate_only, MTdata d)
 {
     clProgramWrapper program;
     clKernelWrapper kernel;
-    clMemWrapper streams[ 3 ];
+    clMemWrapper streams[3];
     size_t dataSize = numElements * 16 * sizeof(cl_long);
 #if !(defined(_WIN32) && defined(_MSC_VER))
-    cl_long inData[numElements * 16], outDataCL[numElements * 16], outDataGL[ numElements * 16 ];
+    cl_long inData[numElements * 16], outDataCL[numElements * 16],
+        outDataGL[numElements * 16];
 #else
-    cl_long* inData    = (cl_long*)_malloca(dataSize);
-    cl_long* outDataCL = (cl_long*)_malloca(dataSize);
-    cl_long* outDataGL = (cl_long*)_malloca(dataSize);
+    cl_long *inData = (cl_long *)_malloca(dataSize);
+    cl_long *outDataCL = (cl_long *)_malloca(dataSize);
+    cl_long *outDataGL = (cl_long *)_malloca(dataSize);
 #endif
     glBufferWrapper inGLBuffer, outGLBuffer;
-    int    i;
+    int i;
     size_t bufferSize;
 
     int error;
@@ -146,210 +146,259 @@ int test_buffer_kernel(cl_context context, cl_command_queue queue, ExplicitType
     char sizeName[4];
 
     /* Create the source */
-    if( vecSize == 1 )
-        sizeName[ 0 ] = 0;
+    if (vecSize == 1)
+        sizeName[0] = 0;
     else
-        sprintf( sizeName, "%d", (int)vecSize );
+        sprintf(sizeName, "%d", (int)vecSize);
 
-    sprintf( kernelSource, bufferKernelPattern, get_explicit_type_name( vecType ), sizeName,
-                                                get_explicit_type_name( vecType ), sizeName,
-                                                get_explicit_type_name( vecType ), sizeName,
-                                                get_explicit_type_name( vecType ), sizeName,
-                                                get_explicit_type_name( vecType ), sizeName );
+    sprintf(kernelSource, bufferKernelPattern, get_explicit_type_name(vecType),
+            sizeName, get_explicit_type_name(vecType), sizeName,
+            get_explicit_type_name(vecType), sizeName,
+            get_explicit_type_name(vecType), sizeName,
+            get_explicit_type_name(vecType), sizeName);
 
     /* Create kernels */
     programPtr = kernelSource;
-    if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
+    if (create_single_kernel_helper(context, &program, &kernel, 1,
+                                    (const char **)&programPtr, "sample_test"))
     {
         return -1;
     }
 
-    bufferSize = numElements * vecSize * get_explicit_type_size( vecType );
+    bufferSize = numElements * vecSize * get_explicit_type_size(vecType);
 
     /* Generate some almost-random input data */
-    gen_input_data( vecType, vecSize * numElements, d, inData );
-    memset( outDataCL, 0, dataSize );
-    memset( outDataGL, 0, dataSize );
+    gen_input_data(vecType, vecSize * numElements, d, inData);
+    memset(outDataCL, 0, dataSize);
+    memset(outDataGL, 0, dataSize);
 
     /* Generate some GL buffers to go against */
-    glGenBuffers( 1, &inGLBuffer );
-    glGenBuffers( 1, &outGLBuffer );
+    glGenBuffers(1, &inGLBuffer);
+    glGenBuffers(1, &outGLBuffer);
 
-    glBindBuffer( GL_ARRAY_BUFFER, inGLBuffer );
-    glBufferData( GL_ARRAY_BUFFER, bufferSize, inData, GL_STATIC_DRAW );
+    glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer);
+    glBufferData(GL_ARRAY_BUFFER, bufferSize, inData, GL_STATIC_DRAW);
 
-    // Note: we need to bind the output buffer, even though we don't care about its values yet,
-    // because CL needs it to get the buffer size
-    glBindBuffer( GL_ARRAY_BUFFER, outGLBuffer );
-    glBufferData( GL_ARRAY_BUFFER, bufferSize, outDataGL, GL_STATIC_DRAW );
+    // Note: we need to bind the output buffer, even though we don't care about
+    // its values yet, because CL needs it to get the buffer size
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
+    glBufferData(GL_ARRAY_BUFFER, bufferSize, outDataGL, GL_STATIC_DRAW);
 
-    glBindBuffer( GL_ARRAY_BUFFER, 0 );
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
     glFinish();
 
 
-    /* Generate some streams. The first and last ones are GL, middle one just vanilla CL */
-    streams[ 0 ] = (*clCreateFromGLBuffer_ptr)( context, CL_MEM_READ_ONLY, inGLBuffer, &error );
-    test_error( error, "Unable to create input GL buffer" );
+    /* Generate some streams. The first and last ones are GL, middle one just
+     * vanilla CL */
+    streams[0] = (*clCreateFromGLBuffer_ptr)(context, CL_MEM_READ_ONLY,
+                                             inGLBuffer, &error);
+    test_error(error, "Unable to create input GL buffer");
 
-    streams[ 1 ] = clCreateBuffer( context, CL_MEM_READ_WRITE, bufferSize, NULL, &error );
-    test_error( error, "Unable to create output CL buffer" );
+    streams[1] =
+        clCreateBuffer(context, CL_MEM_READ_WRITE, bufferSize, NULL, &error);
+    test_error(error, "Unable to create output CL buffer");
 
-    streams[ 2 ] = (*clCreateFromGLBuffer_ptr)( context, CL_MEM_WRITE_ONLY, outGLBuffer, &error );
-    test_error( error, "Unable to create output GL buffer" );
+    streams[2] = (*clCreateFromGLBuffer_ptr)(context, CL_MEM_WRITE_ONLY,
+                                             outGLBuffer, &error);
+    test_error(error, "Unable to create output GL buffer");
 
 
-  /* Validate the info */
-  if (validate_only) {
-    int result = (CheckGLObjectInfo(streams[0], CL_GL_OBJECT_BUFFER, (GLuint)inGLBuffer, (GLenum)0, 0) |
-                  CheckGLObjectInfo(streams[2], CL_GL_OBJECT_BUFFER, (GLuint)outGLBuffer, (GLenum)0, 0) );
-    for(i=0;i<3;i++)
+    /* Validate the info */
+    if (validate_only)
     {
-        clReleaseMemObject(streams[i]);
-        streams[i] = NULL;
-    }
+        int result = (CheckGLObjectInfo(streams[0], CL_GL_OBJECT_BUFFER,
+                                        (GLuint)inGLBuffer, (GLenum)0, 0)
+                      | CheckGLObjectInfo(streams[2], CL_GL_OBJECT_BUFFER,
+                                          (GLuint)outGLBuffer, (GLenum)0, 0));
+        for (i = 0; i < 3; i++)
+        {
+            streams[i].reset();
+        }
 
-    glDeleteBuffers(1, &inGLBuffer);    inGLBuffer = 0;
-    glDeleteBuffers(1, &outGLBuffer);    outGLBuffer = 0;
+        glDeleteBuffers(1, &inGLBuffer);
+        inGLBuffer = 0;
+        glDeleteBuffers(1, &outGLBuffer);
+        outGLBuffer = 0;
 
-    return result;
-  }
+        return result;
+    }
 
     /* Assign streams and execute */
-    for( int i = 0; i < 3; i++ )
+    for (int i = 0; i < 3; i++)
     {
-        error = clSetKernelArg( kernel, i, sizeof( streams[ i ] ), &streams[ i ] );
-        test_error( error, "Unable to set kernel arguments" );
+        error = clSetKernelArg(kernel, i, sizeof(streams[i]), &streams[i]);
+        test_error(error, "Unable to set kernel arguments");
     }
-    error = (*clEnqueueAcquireGLObjects_ptr)( queue, 1, &streams[ 0 ], 0, NULL, NULL);
-  test_error( error, "Unable to acquire GL obejcts");
-    error = (*clEnqueueAcquireGLObjects_ptr)( queue, 1, &streams[ 2 ], 0, NULL, NULL);
-  test_error( error, "Unable to acquire GL obejcts");
+    error =
+        (*clEnqueueAcquireGLObjects_ptr)(queue, 1, &streams[0], 0, NULL, NULL);
+    test_error(error, "Unable to acquire GL obejcts");
+    error =
+        (*clEnqueueAcquireGLObjects_ptr)(queue, 1, &streams[2], 0, NULL, NULL);
+    test_error(error, "Unable to acquire GL obejcts");
 
     /* Run the kernel */
     threads[0] = numElements;
 
-    error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
-    test_error( error, "Unable to get work group size to use" );
-
-  error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
-    test_error( error, "Unable to execute test kernel" );
-
-    error = (*clEnqueueReleaseGLObjects_ptr)( queue, 1, &streams[ 0 ], 0, NULL, NULL );
-  test_error(error, "clEnqueueReleaseGLObjects failed");
-    error = (*clEnqueueReleaseGLObjects_ptr)( queue, 1, &streams[ 2 ], 0, NULL, NULL );
-  test_error(error, "clEnqueueReleaseGLObjects failed");
-
-    // Get the results from both CL and GL and make sure everything looks correct
-    error = clEnqueueReadBuffer( queue, streams[ 1 ], CL_TRUE, 0, bufferSize, outDataCL, 0, NULL, NULL );
-    test_error( error, "Unable to read output CL array!" );
-
-    glBindBuffer( GL_ARRAY_BUFFER, outGLBuffer );
-    void *glMem = glMapBuffer( GL_ARRAY_BUFFER, GL_READ_ONLY );
-    memcpy( outDataGL, glMem, bufferSize );
-    glUnmapBuffer( GL_ARRAY_BUFFER );
-
-    char *inP = (char *)inData, *glP = (char *)outDataGL, *clP = (char *)outDataCL;
+    error = get_max_common_work_group_size(context, kernel, threads[0],
+                                           &localThreads[0]);
+    test_error(error, "Unable to get work group size to use");
+
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, NULL);
+    test_error(error, "Unable to execute test kernel");
+
+    error =
+        (*clEnqueueReleaseGLObjects_ptr)(queue, 1, &streams[0], 0, NULL, NULL);
+    test_error(error, "clEnqueueReleaseGLObjects failed");
+    error =
+        (*clEnqueueReleaseGLObjects_ptr)(queue, 1, &streams[2], 0, NULL, NULL);
+    test_error(error, "clEnqueueReleaseGLObjects failed");
+
+    // Get the results from both CL and GL and make sure everything looks
+    // correct
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, bufferSize,
+                                outDataCL, 0, NULL, NULL);
+    test_error(error, "Unable to read output CL array!");
+
+    glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer);
+    void *glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
+    memcpy(outDataGL, glMem, bufferSize);
+    glUnmapBuffer(GL_ARRAY_BUFFER);
+
+    char *inP = (char *)inData, *glP = (char *)outDataGL,
+         *clP = (char *)outDataCL;
     error = 0;
-    for( size_t i = 0; i < numElements * vecSize; i++ )
+    for (size_t i = 0; i < numElements * vecSize; i++)
     {
         cl_long expectedCLValue, expectedGLValue;
-        get_incremented_value( inP, &expectedCLValue, vecType );
-        get_incremented_value( &expectedCLValue, &expectedGLValue, vecType );
+        get_incremented_value(inP, &expectedCLValue, vecType);
+        get_incremented_value(&expectedCLValue, &expectedGLValue, vecType);
 
-        if( memcmp( clP, &expectedCLValue, get_explicit_type_size( vecType ) ) != 0 )
+        if (memcmp(clP, &expectedCLValue, get_explicit_type_size(vecType)) != 0)
         {
-            char scratch[ 64 ];
-            log_error( "ERROR: Data sample %d from the CL output did not validate!\n", (int)i );
-            log_error( "\t   Input: %s\n", GetDataVectorString( inP, get_explicit_type_size( vecType ), 1, scratch ) );
-            log_error( "\tExpected: %s\n", GetDataVectorString( &expectedCLValue, get_explicit_type_size( vecType ), 1, scratch ) );
-            log_error( "\t  Actual: %s\n", GetDataVectorString( clP, get_explicit_type_size( vecType ), 1, scratch ) );
+            char scratch[64];
+            log_error(
+                "ERROR: Data sample %d from the CL output did not validate!\n",
+                (int)i);
+            log_error("\t   Input: %s\n",
+                      GetDataVectorString(inP, get_explicit_type_size(vecType),
+                                          1, scratch));
+            log_error("\tExpected: %s\n",
+                      GetDataVectorString(&expectedCLValue,
+                                          get_explicit_type_size(vecType), 1,
+                                          scratch));
+            log_error("\t  Actual: %s\n",
+                      GetDataVectorString(clP, get_explicit_type_size(vecType),
+                                          1, scratch));
             error = -1;
         }
 
-        if( memcmp( glP, &expectedGLValue, get_explicit_type_size( vecType ) ) != 0 )
+        if (memcmp(glP, &expectedGLValue, get_explicit_type_size(vecType)) != 0)
         {
-            char scratch[ 64 ];
-            log_error( "ERROR: Data sample %d from the GL output did not validate!\n", (int)i );
-            log_error( "\t   Input: %s\n", GetDataVectorString( inP, get_explicit_type_size( vecType ), 1, scratch ) );
-            log_error( "\tExpected: %s\n", GetDataVectorString( &expectedGLValue, get_explicit_type_size( vecType ), 1, scratch ) );
-            log_error( "\t  Actual: %s\n", GetDataVectorString( glP, get_explicit_type_size( vecType ), 1, scratch ) );
+            char scratch[64];
+            log_error(
+                "ERROR: Data sample %d from the GL output did not validate!\n",
+                (int)i);
+            log_error("\t   Input: %s\n",
+                      GetDataVectorString(inP, get_explicit_type_size(vecType),
+                                          1, scratch));
+            log_error("\tExpected: %s\n",
+                      GetDataVectorString(&expectedGLValue,
+                                          get_explicit_type_size(vecType), 1,
+                                          scratch));
+            log_error("\t  Actual: %s\n",
+                      GetDataVectorString(glP, get_explicit_type_size(vecType),
+                                          1, scratch));
             error = -1;
         }
 
-        if( error )
-            return error;
+        if (error) return error;
 
-        inP += get_explicit_type_size( vecType );
-        glP += get_explicit_type_size( vecType );
-        clP += get_explicit_type_size( vecType );
+        inP += get_explicit_type_size(vecType);
+        glP += get_explicit_type_size(vecType);
+        clP += get_explicit_type_size(vecType);
     }
 
-    for(i=0;i<3;i++)
+    for (i = 0; i < 3; i++)
     {
-        clReleaseMemObject(streams[i]);
-        streams[i] = NULL;
+        streams[i].reset();
     }
 
-    glDeleteBuffers(1, &inGLBuffer);    inGLBuffer = 0;
-    glDeleteBuffers(1, &outGLBuffer);    outGLBuffer = 0;
+    glDeleteBuffers(1, &inGLBuffer);
+    inGLBuffer = 0;
+    glDeleteBuffers(1, &outGLBuffer);
+    outGLBuffer = 0;
 
     return 0;
 }
 
-int test_buffers( cl_device_id device, cl_context context, cl_command_queue queue, int numElements )
+int test_buffers(cl_device_id device, cl_context context,
+                 cl_command_queue queue, int numElements)
 {
-    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kNumExplicitTypes };
+    ExplicitType vecType[] = {
+        kChar, kUChar, kShort, kUShort, kInt,
+        kUInt, kLong,  kULong, kFloat,  kNumExplicitTypes
+    };
     unsigned int vecSizes[] = { 1, 2, 4, 8, 16, 0 };
     unsigned int index, typeIndex;
     int retVal = 0;
     RandomSeed seed(gRandomSeed);
 
 
-    for( typeIndex = 0; vecType[ typeIndex ] != kNumExplicitTypes; typeIndex++ )
+    for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
     {
-        for( index = 0; vecSizes[ index ] != 0; index++ )
+        for (index = 0; vecSizes[index] != 0; index++)
         {
             // Test!
-            if( test_buffer_kernel( context, queue, vecType[ typeIndex ], vecSizes[ index ], numElements, 0, seed) != 0 )
+            if (test_buffer_kernel(context, queue, vecType[typeIndex],
+                                   vecSizes[index], numElements, 0, seed)
+                != 0)
             {
-                char sizeNames[][ 4 ] = { "", "", "2", "", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
-                log_error( "   Buffer test %s%s FAILED\n", get_explicit_type_name( vecType[ typeIndex ] ), sizeNames[ vecSizes[ index ] ] );
+                char sizeNames[][4] = { "", "", "2", "", "4", "", "", "",  "8",
+                                        "", "", "",  "", "",  "", "", "16" };
+                log_error("   Buffer test %s%s FAILED\n",
+                          get_explicit_type_name(vecType[typeIndex]),
+                          sizeNames[vecSizes[index]]);
                 retVal++;
             }
         }
     }
 
     return retVal;
-
 }
 
 
-int test_buffers_getinfo( cl_device_id device, cl_context context, cl_command_queue queue, int numElements )
+int test_buffers_getinfo(cl_device_id device, cl_context context,
+                         cl_command_queue queue, int numElements)
 {
-    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kNumExplicitTypes };
+    ExplicitType vecType[] = {
+        kChar, kUChar, kShort, kUShort, kInt,
+        kUInt, kLong,  kULong, kFloat,  kNumExplicitTypes
+    };
     unsigned int vecSizes[] = { 1, 2, 4, 8, 16, 0 };
     unsigned int index, typeIndex;
     int retVal = 0;
-    RandomSeed seed( gRandomSeed );
+    RandomSeed seed(gRandomSeed);
 
 
-    for( typeIndex = 0; vecType[ typeIndex ] != kNumExplicitTypes; typeIndex++ )
+    for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
     {
-        for( index = 0; vecSizes[ index ] != 0; index++ )
+        for (index = 0; vecSizes[index] != 0; index++)
         {
             // Test!
-            if( test_buffer_kernel( context, queue, vecType[ typeIndex ], vecSizes[ index ], numElements, 1, seed ) != 0 )
+            if (test_buffer_kernel(context, queue, vecType[typeIndex],
+                                   vecSizes[index], numElements, 1, seed)
+                != 0)
             {
-                char sizeNames[][ 4 ] = { "", "", "2", "", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
-                log_error( "   Buffer test %s%s FAILED\n", get_explicit_type_name( vecType[ typeIndex ] ), sizeNames[ vecSizes[ index ] ] );
+                char sizeNames[][4] = { "", "", "2", "", "4", "", "", "",  "8",
+                                        "", "", "",  "", "",  "", "", "16" };
+                log_error("   Buffer test %s%s FAILED\n",
+                          get_explicit_type_name(vecType[typeIndex]),
+                          sizeNames[vecSizes[index]]);
                 retVal++;
             }
         }
     }
 
     return retVal;
-
 }
-
-
-
diff --git a/test_conformance/gl/test_fence_sync.cpp b/test_conformance/gl/test_fence_sync.cpp
index 00bf2cc90b..35cc62de62 100644
--- a/test_conformance/gl/test_fence_sync.cpp
+++ b/test_conformance/gl/test_fence_sync.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -17,7 +17,7 @@
 #include "gl/setup.h"
 #include "harness/genericThread.h"
 
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
 #include <OpenGL/glu.h>
 #else
 #include <GL/glu.h>
@@ -40,112 +40,121 @@ typedef struct __GLsync *GLsync;
 #define APIENTRY
 #endif
 
-typedef GLsync (APIENTRY *glFenceSyncPtr)(GLenum condition,GLbitfield flags);
+typedef GLsync(APIENTRY *glFenceSyncPtr)(GLenum condition, GLbitfield flags);
 glFenceSyncPtr glFenceSyncFunc;
 
-typedef bool (APIENTRY *glIsSyncPtr)(GLsync sync);
+typedef bool(APIENTRY *glIsSyncPtr)(GLsync sync);
 glIsSyncPtr glIsSyncFunc;
 
-typedef void (APIENTRY *glDeleteSyncPtr)(GLsync sync);
+typedef void(APIENTRY *glDeleteSyncPtr)(GLsync sync);
 glDeleteSyncPtr glDeleteSyncFunc;
 
-typedef GLenum (APIENTRY *glClientWaitSyncPtr)(GLsync sync,GLbitfield flags,GLuint64 timeout);
+typedef GLenum(APIENTRY *glClientWaitSyncPtr)(GLsync sync, GLbitfield flags,
+                                              GLuint64 timeout);
 glClientWaitSyncPtr glClientWaitSyncFunc;
 
-typedef void (APIENTRY *glWaitSyncPtr)(GLsync sync,GLbitfield flags,GLuint64 timeout);
+typedef void(APIENTRY *glWaitSyncPtr)(GLsync sync, GLbitfield flags,
+                                      GLuint64 timeout);
 glWaitSyncPtr glWaitSyncFunc;
 
-typedef void (APIENTRY *glGetInteger64vPtr)(GLenum pname, GLint64 *params);
+typedef void(APIENTRY *glGetInteger64vPtr)(GLenum pname, GLint64 *params);
 glGetInteger64vPtr glGetInteger64vFunc;
 
-typedef void (APIENTRY *glGetSyncivPtr)(GLsync sync,GLenum pname,GLsizei bufSize,GLsizei *length,
-                               GLint *values);
+typedef void(APIENTRY *glGetSyncivPtr)(GLsync sync, GLenum pname,
+                                       GLsizei bufSize, GLsizei *length,
+                                       GLint *values);
 glGetSyncivPtr glGetSyncivFunc;
 
 #define CHK_GL_ERR() printf("%s\n", gluErrorString(glGetError()))
 
-static void InitSyncFns( void )
+static void InitSyncFns(void)
 {
-    glFenceSyncFunc = (glFenceSyncPtr)glutGetProcAddress( "glFenceSync" );
-    glIsSyncFunc = (glIsSyncPtr)glutGetProcAddress( "glIsSync" );
-    glDeleteSyncFunc = (glDeleteSyncPtr)glutGetProcAddress( "glDeleteSync" );
-    glClientWaitSyncFunc = (glClientWaitSyncPtr)glutGetProcAddress( "glClientWaitSync" );
-    glWaitSyncFunc = (glWaitSyncPtr)glutGetProcAddress( "glWaitSync" );
-    glGetInteger64vFunc = (glGetInteger64vPtr)glutGetProcAddress( "glGetInteger64v" );
-    glGetSyncivFunc = (glGetSyncivPtr)glutGetProcAddress( "glGetSynciv" );
+    glFenceSyncFunc = (glFenceSyncPtr)glutGetProcAddress("glFenceSync");
+    glIsSyncFunc = (glIsSyncPtr)glutGetProcAddress("glIsSync");
+    glDeleteSyncFunc = (glDeleteSyncPtr)glutGetProcAddress("glDeleteSync");
+    glClientWaitSyncFunc =
+        (glClientWaitSyncPtr)glutGetProcAddress("glClientWaitSync");
+    glWaitSyncFunc = (glWaitSyncPtr)glutGetProcAddress("glWaitSync");
+    glGetInteger64vFunc =
+        (glGetInteger64vPtr)glutGetProcAddress("glGetInteger64v");
+    glGetSyncivFunc = (glGetSyncivPtr)glutGetProcAddress("glGetSynciv");
 }
 #ifndef GL_ARB_sync
-#define GL_MAX_SERVER_WAIT_TIMEOUT        0x9111
+#define GL_MAX_SERVER_WAIT_TIMEOUT 0x9111
 
-#define GL_OBJECT_TYPE            0x9112
-#define GL_SYNC_CONDITION            0x9113
-#define GL_SYNC_STATUS            0x9114
-#define GL_SYNC_FLAGS            0x9115
+#define GL_OBJECT_TYPE 0x9112
+#define GL_SYNC_CONDITION 0x9113
+#define GL_SYNC_STATUS 0x9114
+#define GL_SYNC_FLAGS 0x9115
 
-#define GL_SYNC_FENCE            0x9116
+#define GL_SYNC_FENCE 0x9116
 
-#define GL_SYNC_GPU_COMMANDS_COMPLETE    0x9117
+#define GL_SYNC_GPU_COMMANDS_COMPLETE 0x9117
 
-#define GL_UNSIGNALED            0x9118
-#define GL_SIGNALED            0x9119
+#define GL_UNSIGNALED 0x9118
+#define GL_SIGNALED 0x9119
 
-#define GL_SYNC_FLUSH_COMMANDS_BIT        0x00000001
+#define GL_SYNC_FLUSH_COMMANDS_BIT 0x00000001
 
-#define GL_TIMEOUT_IGNORED            0xFFFFFFFFFFFFFFFFull
+#define GL_TIMEOUT_IGNORED 0xFFFFFFFFFFFFFFFFull
 
-#define GL_ALREADY_SIGNALED        0x911A
-#define GL_TIMEOUT_EXPIRED            0x911B
-#define GL_CONDITION_SATISFIED        0x911C
-#define GL_WAIT_FAILED            0x911D
+#define GL_ALREADY_SIGNALED 0x911A
+#define GL_TIMEOUT_EXPIRED 0x911B
+#define GL_CONDITION_SATISFIED 0x911C
+#define GL_WAIT_FAILED 0x911D
 #endif
 
 #define USING_ARB_sync 1
 #endif
 
-typedef cl_event (CL_API_CALL *clCreateEventFromGLsyncKHR_fn)( cl_context context, GLsync sync, cl_int *errCode_ret) ;
+typedef cl_event(CL_API_CALL *clCreateEventFromGLsyncKHR_fn)(
+    cl_context context, GLsync sync, cl_int *errCode_ret);
 
 clCreateEventFromGLsyncKHR_fn clCreateEventFromGLsyncKHR_ptr;
 
 
 static const char *updateBuffersKernel[] = {
-    "__kernel void update( __global float4 * vertices, __global float4 *colors, int horizWrap, int rowIdx )\n"
+    "__kernel void update( __global float4 * vertices, __global float4 "
+    "*colors, int horizWrap, int rowIdx )\n"
     "{\n"
     "    size_t tid = get_global_id(0);\n"
     "\n"
     "    size_t xVal = ( tid & ( horizWrap - 1 ) );\n"
     "    vertices[ tid * 2 + 0 ] = (float4)( xVal, rowIdx*16.f, 0.0f, 1.f );\n"
-    "    vertices[ tid * 2 + 1 ] = (float4)( xVal, rowIdx*16.f + 4.0f, 0.0f, 1.f );\n"
+    "    vertices[ tid * 2 + 1 ] = (float4)( xVal, rowIdx*16.f + 4.0f, 0.0f, "
+    "1.f );\n"
     "\n"
     "    int rowV = rowIdx + 1;\n"
-    "    colors[ tid * 2 + 0 ] = (float4)( ( rowV & 1 ) / 255.f, ( ( rowV & 2 ) >> 1 ) / 255.f, ( ( rowV & 4 ) >> 2 ) / 255.f, 1.f );\n"
-    "    //colors[ tid * 2 + 0 ] = (float4)( (float)xVal/(float)horizWrap, 1.0f, 1.0f, 1.0f );\n"
+    "    colors[ tid * 2 + 0 ] = (float4)( ( rowV & 1 ) / 255.f, ( ( rowV & 2 "
+    ") >> 1 ) / 255.f, ( ( rowV & 4 ) >> 2 ) / 255.f, 1.f );\n"
+    "    //colors[ tid * 2 + 0 ] = (float4)( (float)xVal/(float)horizWrap, "
+    "1.0f, 1.0f, 1.0f );\n"
     "    colors[ tid * 2 + 1 ] = colors[ tid * 2 + 0 ];\n"
-    "}\n" };
-
-//Passthrough VertexShader
-static const char *vertexshader =
-"#version 150\n"
-"uniform mat4 projMatrix;\n"
-"in vec4 inPosition;\n"
-"in vec4 inColor;\n"
-"out vec4 vertColor;\n"
-"void main (void) {\n"
-"    gl_Position = projMatrix*inPosition;\n"
-"   vertColor = inColor;\n"
-"}\n";
-
-//Passthrough FragmentShader
-static const char *fragmentshader =
-"#version 150\n"
-"in vec4 vertColor;\n"
-"out vec4 outColor;\n"
-"void main (void) {\n"
-"    outColor = vertColor;\n"
-"}\n";
+    "}\n"
+};
+
+// Passthrough VertexShader
+static const char *vertexshader = "#version 150\n"
+                                  "uniform mat4 projMatrix;\n"
+                                  "in vec4 inPosition;\n"
+                                  "in vec4 inColor;\n"
+                                  "out vec4 vertColor;\n"
+                                  "void main (void) {\n"
+                                  "    gl_Position = projMatrix*inPosition;\n"
+                                  "   vertColor = inColor;\n"
+                                  "}\n";
+
+// Passthrough FragmentShader
+static const char *fragmentshader = "#version 150\n"
+                                    "in vec4 vertColor;\n"
+                                    "out vec4 outColor;\n"
+                                    "void main (void) {\n"
+                                    "    outColor = vertColor;\n"
+                                    "}\n";
 
 GLuint createShaderProgram(GLint *posLoc, GLint *colLoc)
 {
-    GLint  logLength, status;
+    GLint logLength, status;
     GLuint program = glCreateProgram();
     GLuint vpShader;
 
@@ -153,8 +162,9 @@ GLuint createShaderProgram(GLint *posLoc, GLint *colLoc)
     glShaderSource(vpShader, 1, (const GLchar **)&vertexshader, NULL);
     glCompileShader(vpShader);
     glGetShaderiv(vpShader, GL_INFO_LOG_LENGTH, &logLength);
-    if (logLength > 0) {
-        GLchar *log = (GLchar*) malloc(logLength);
+    if (logLength > 0)
+    {
+        GLchar *log = (GLchar *)malloc(logLength);
         glGetShaderInfoLog(vpShader, logLength, &logLength, log);
         log_info("Vtx Shader compile log:\n%s", log);
         free(log);
@@ -175,8 +185,9 @@ GLuint createShaderProgram(GLint *posLoc, GLint *colLoc)
     glCompileShader(fpShader);
 
     glGetShaderiv(fpShader, GL_INFO_LOG_LENGTH, &logLength);
-    if (logLength > 0) {
-        GLchar *log = (GLchar*)malloc(logLength);
+    if (logLength > 0)
+    {
+        GLchar *log = (GLchar *)malloc(logLength);
         glGetShaderInfoLog(fpShader, logLength, &logLength, log);
         log_info("Frag Shader compile log:\n%s", log);
         free(log);
@@ -192,8 +203,9 @@ GLuint createShaderProgram(GLint *posLoc, GLint *colLoc)
 
     glLinkProgram(program);
     glGetProgramiv(program, GL_INFO_LOG_LENGTH, &logLength);
-    if (logLength > 0) {
-        GLchar *log = (GLchar*)malloc(logLength);
+    if (logLength > 0)
+    {
+        GLchar *log = (GLchar *)malloc(logLength);
         glGetProgramInfoLog(program, logLength, &logLength, log);
         log_info("Program link log:\n%s", log);
         free(log);
@@ -219,7 +231,7 @@ void destroyShaderProgram(GLuint program)
     glUseProgram(0);
     glGetAttachedShaders(program, 2, &count, shaders);
     int i;
-    for(i = 0; i < count; i++)
+    for (i = 0; i < count; i++)
     {
         glDetachShader(program, shaders[i]);
         glDeleteShader(shaders[i]);
@@ -227,44 +239,49 @@ void destroyShaderProgram(GLuint program)
     glDeleteProgram(program);
 }
 
-// This function queues up and runs the above CL kernel that writes the vertex data
-cl_int run_cl_kernel( cl_kernel kernel, cl_command_queue queue, cl_mem stream0, cl_mem stream1,
-                     cl_int rowIdx, cl_event fenceEvent, size_t numThreads )
+// This function queues up and runs the above CL kernel that writes the vertex
+// data
+cl_int run_cl_kernel(cl_kernel kernel, cl_command_queue queue, cl_mem stream0,
+                     cl_mem stream1, cl_int rowIdx, cl_event fenceEvent,
+                     size_t numThreads)
 {
-    cl_int error = clSetKernelArg( kernel, 3, sizeof( rowIdx ), &rowIdx );
-    test_error( error, "Unable to set kernel arguments" );
+    cl_int error = clSetKernelArg(kernel, 3, sizeof(rowIdx), &rowIdx);
+    test_error(error, "Unable to set kernel arguments");
 
     clEventWrapper acqEvent1, acqEvent2, kernEvent, relEvent1, relEvent2;
-    int numEvents = ( fenceEvent != NULL ) ? 1 : 0;
-    cl_event *fence_evt = ( fenceEvent != NULL ) ? &fenceEvent : NULL;
+    int numEvents = (fenceEvent != NULL) ? 1 : 0;
+    cl_event *fence_evt = (fenceEvent != NULL) ? &fenceEvent : NULL;
 
-    error = (*clEnqueueAcquireGLObjects_ptr)( queue, 1, &stream0, numEvents, fence_evt, &acqEvent1 );
-    test_error( error, "Unable to acquire GL obejcts");
-    error = (*clEnqueueAcquireGLObjects_ptr)( queue, 1, &stream1, numEvents, fence_evt, &acqEvent2 );
-    test_error( error, "Unable to acquire GL obejcts");
+    error = (*clEnqueueAcquireGLObjects_ptr)(queue, 1, &stream0, numEvents,
+                                             fence_evt, &acqEvent1);
+    test_error(error, "Unable to acquire GL obejcts");
+    error = (*clEnqueueAcquireGLObjects_ptr)(queue, 1, &stream1, numEvents,
+                                             fence_evt, &acqEvent2);
+    test_error(error, "Unable to acquire GL obejcts");
 
-    cl_event evts[ 2 ] = { acqEvent1, acqEvent2 };
+    cl_event evts[2] = { acqEvent1, acqEvent2 };
 
-    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, &numThreads, NULL, 2, evts, &kernEvent );
-    test_error( error, "Unable to execute test kernel" );
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &numThreads, NULL, 2,
+                                   evts, &kernEvent);
+    test_error(error, "Unable to execute test kernel");
 
-    error = (*clEnqueueReleaseGLObjects_ptr)( queue, 1, &stream0, 1, &kernEvent, &relEvent1 );
+    error = (*clEnqueueReleaseGLObjects_ptr)(queue, 1, &stream0, 1, &kernEvent,
+                                             &relEvent1);
     test_error(error, "clEnqueueReleaseGLObjects failed");
-    error = (*clEnqueueReleaseGLObjects_ptr)( queue, 1, &stream1, 1, &kernEvent, &relEvent2 );
+    error = (*clEnqueueReleaseGLObjects_ptr)(queue, 1, &stream1, 1, &kernEvent,
+                                             &relEvent2);
     test_error(error, "clEnqueueReleaseGLObjects failed");
 
-    evts[ 0 ] = relEvent1;
-    evts[ 1 ] = relEvent2;
-    error = clWaitForEvents( 2, evts );
-    test_error( error, "Unable to wait for release events" );
+    evts[0] = relEvent1;
+    evts[1] = relEvent2;
+    error = clWaitForEvents(2, evts);
+    test_error(error, "Unable to wait for release events");
 
     return 0;
 }
 
-class RunThread : public genericThread
-{
+class RunThread : public genericThread {
 public:
-
     cl_kernel mKernel;
     cl_command_queue mQueue;
     cl_mem mStream0, mStream1;
@@ -272,34 +289,40 @@ class RunThread : public genericThread
     cl_event mFenceEvent;
     size_t mNumThreads;
 
-    RunThread( cl_kernel kernel, cl_command_queue queue, cl_mem stream0, cl_mem stream1, size_t numThreads )
-    : mKernel( kernel ), mQueue( queue ), mStream0( stream0 ), mStream1( stream1 ), mNumThreads( numThreads )
-    {
-    }
+    RunThread(cl_kernel kernel, cl_command_queue queue, cl_mem stream0,
+              cl_mem stream1, size_t numThreads)
+        : mKernel(kernel), mQueue(queue), mStream0(stream0), mStream1(stream1),
+          mNumThreads(numThreads)
+    {}
 
-    void SetRunData( cl_int rowIdx, cl_event fenceEvent )
+    void SetRunData(cl_int rowIdx, cl_event fenceEvent)
     {
         mRowIdx = rowIdx;
         mFenceEvent = fenceEvent;
     }
 
-    virtual void * IRun( void )
+    virtual void *IRun(void)
     {
-        cl_int error = run_cl_kernel( mKernel, mQueue, mStream0, mStream1, mRowIdx, mFenceEvent, mNumThreads );
+        cl_int error = run_cl_kernel(mKernel, mQueue, mStream0, mStream1,
+                                     mRowIdx, mFenceEvent, mNumThreads);
         return (void *)(uintptr_t)error;
     }
 };
 
 
-int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_queue queue, bool separateThreads, GLint rend_vs, GLint read_vs, cl_device_id rend_device )
+int test_fence_sync_single(cl_device_id device, cl_context context,
+                           cl_command_queue queue, bool separateThreads,
+                           GLint rend_vs, GLint read_vs,
+                           cl_device_id rend_device)
 {
     int error;
     const int framebufferSize = 512;
 
 
-    if( !is_extension_available( device, "cl_khr_gl_event" ) )
+    if (!is_extension_available(device, "cl_khr_gl_event"))
     {
-        log_info( "NOTE: cl_khr_gl_event extension not present on this device; skipping fence sync test\n" );
+        log_info("NOTE: cl_khr_gl_event extension not present on this device; "
+                 "skipping fence sync test\n");
         return 0;
     }
 
@@ -312,10 +335,11 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_
     clGetPlatformIDs(0, NULL, &nplatforms);
     clGetPlatformIDs(1, &platform, NULL);
 
-    if (nplatforms > 1) {
+    if (nplatforms > 1)
+    {
         log_info("clGetPlatformIDs returned multiple values.  This is not "
-            "an error, but might result in obtaining incorrect function "
-            "pointers if you do not want the first returned platform.\n");
+                 "an error, but might result in obtaining incorrect function "
+                 "pointers if you do not want the first returned platform.\n");
 
         // Show them the platform name, in case it is a problem.
 
@@ -323,28 +347,35 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_
         char *name;
 
         clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &size);
-        name = (char*)malloc(size);
+        name = (char *)malloc(size);
         clGetPlatformInfo(platform, CL_PLATFORM_NAME, size, name, NULL);
 
         log_info("Using platform with name: %s \n", name);
         free(name);
     }
 
-    clCreateEventFromGLsyncKHR_ptr = (clCreateEventFromGLsyncKHR_fn)clGetExtensionFunctionAddressForPlatform(platform, "clCreateEventFromGLsyncKHR");
-    if( clCreateEventFromGLsyncKHR_ptr == NULL )
+    clCreateEventFromGLsyncKHR_ptr =
+        (clCreateEventFromGLsyncKHR_fn)clGetExtensionFunctionAddressForPlatform(
+            platform, "clCreateEventFromGLsyncKHR");
+    if (clCreateEventFromGLsyncKHR_ptr == NULL)
     {
-        log_error( "ERROR: Unable to run fence_sync test (clCreateEventFromGLsyncKHR function not discovered!)\n" );
-        clCreateEventFromGLsyncKHR_ptr = (clCreateEventFromGLsyncKHR_fn)clGetExtensionFunctionAddressForPlatform(platform, "clCreateEventFromGLsyncAPPLE");
+        log_error("ERROR: Unable to run fence_sync test "
+                  "(clCreateEventFromGLsyncKHR function not discovered!)\n");
+        clCreateEventFromGLsyncKHR_ptr = (clCreateEventFromGLsyncKHR_fn)
+            clGetExtensionFunctionAddressForPlatform(
+                platform, "clCreateEventFromGLsyncAPPLE");
         return -1;
     }
 
 #ifdef USING_ARB_sync
-    char *gl_version_str = (char*)glGetString( GL_VERSION );
+    char *gl_version_str = (char *)glGetString(GL_VERSION);
     float glCoreVersion;
     sscanf(gl_version_str, "%f", &glCoreVersion);
-    if( glCoreVersion < 3.0f )
+    if (glCoreVersion < 3.0f)
     {
-        log_info( "OpenGL version %f does not support fence/sync! Skipping test.\n", glCoreVersion );
+        log_info(
+            "OpenGL version %f does not support fence/sync! Skipping test.\n",
+            glCoreVersion);
         return 0;
     }
 
@@ -354,10 +385,13 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_
     GLint val, screen;
     CGLGetVirtualScreen(currCtx, &screen);
     CGLDescribePixelFormat(pixFmt, screen, kCGLPFAOpenGLProfile, &val);
-    if(val != kCGLOGLPVersion_3_2_Core)
+    if (val != kCGLOGLPVersion_3_2_Core)
     {
-        log_error( "OpenGL context was not created with OpenGL version >= 3.0 profile even though platform supports it"
-                  "OpenGL profile %f does not support fence/sync! Skipping test.\n", glCoreVersion );
+        log_error(
+            "OpenGL context was not created with OpenGL version >= 3.0 profile "
+            "even though platform supports it"
+            "OpenGL profile %f does not support fence/sync! Skipping test.\n",
+            glCoreVersion);
         return -1;
     }
 #else
@@ -365,7 +399,7 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_
     HDC hdc = wglGetCurrentDC();
     HGLRC hglrc = wglGetCurrentContext();
 #else
-    Display* dpy = glXGetCurrentDisplay();
+    Display *dpy = glXGetCurrentDisplay();
     GLXDrawable drawable = glXGetCurrentDrawable();
     GLXContext ctx = glXGetCurrentContext();
 #endif
@@ -386,51 +420,66 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_
 
     GLint posLoc, colLoc;
     GLuint shaderprogram = createShaderProgram(&posLoc, &colLoc);
-    if(!shaderprogram)
+    if (!shaderprogram)
     {
         log_error("Failed to create shader program\n");
         return -1;
     }
 
-    float l = 0.0f; float r = framebufferSize;
-    float b = 0.0f; float t = framebufferSize;
-
-    float projMatrix[16] = { 2.0f/(r-l), 0.0f, 0.0f, 0.0f,
-        0.0f, 2.0f/(t-b), 0.0f, 0.0f,
-        0.0f, 0.0f, -1.0f, 0.0f,
-        -(r+l)/(r-l), -(t+b)/(t-b), 0.0f, 1.0f
-    };
+    float l = 0.0f;
+    float r = framebufferSize;
+    float b = 0.0f;
+    float t = framebufferSize;
+
+    float projMatrix[16] = { 2.0f / (r - l),
+                             0.0f,
+                             0.0f,
+                             0.0f,
+                             0.0f,
+                             2.0f / (t - b),
+                             0.0f,
+                             0.0f,
+                             0.0f,
+                             0.0f,
+                             -1.0f,
+                             0.0f,
+                             -(r + l) / (r - l),
+                             -(t + b) / (t - b),
+                             0.0f,
+                             1.0f };
 
     glUseProgram(shaderprogram);
     GLuint projMatLoc = glGetUniformLocation(shaderprogram, "projMatrix");
     glUniformMatrix4fv(projMatLoc, 1, 0, projMatrix);
     glUseProgram(0);
 
-    // Note: the framebuffer is just the target to verify our results against, so we don't
-    // really care to go through all the possible formats in this case
+    // Note: the framebuffer is just the target to verify our results against,
+    // so we don't really care to go through all the possible formats in this
+    // case
     glFramebufferWrapper glFramebuffer;
     glRenderbufferWrapper glRenderbuffer;
-    error = CreateGLRenderbufferRaw( framebufferSize, 128, GL_COLOR_ATTACHMENT0_EXT,
-                                    GL_RGBA, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV,
-                                    &glFramebuffer, &glRenderbuffer );
-    if( error != 0 )
-        return error;
+    error = CreateGLRenderbufferRaw(
+        framebufferSize, 128, GL_COLOR_ATTACHMENT0_EXT, GL_RGBA, GL_RGBA,
+        GL_UNSIGNED_INT_8_8_8_8_REV, &glFramebuffer, &glRenderbuffer);
+    if (error != 0) return error;
 
     GLuint vao;
     glGenVertexArrays(1, &vao);
     glBindVertexArray(vao);
 
     glBufferWrapper vtxBuffer, colorBuffer;
-    glGenBuffers( 1, &vtxBuffer );
-    glGenBuffers( 1, &colorBuffer );
+    glGenBuffers(1, &vtxBuffer);
+    glGenBuffers(1, &colorBuffer);
 
-    const int numHorizVertices = ( framebufferSize * 64 ) + 1;
+    const int numHorizVertices = (framebufferSize * 64) + 1;
 
-    glBindBuffer( GL_ARRAY_BUFFER, vtxBuffer );
-    glBufferData( GL_ARRAY_BUFFER, sizeof( GLfloat ) * numHorizVertices * 2 * 4, NULL, GL_STATIC_DRAW );
+    glBindBuffer(GL_ARRAY_BUFFER, vtxBuffer);
+    glBufferData(GL_ARRAY_BUFFER, sizeof(GLfloat) * numHorizVertices * 2 * 4,
+                 NULL, GL_STATIC_DRAW);
 
-    glBindBuffer( GL_ARRAY_BUFFER, colorBuffer );
-    glBufferData( GL_ARRAY_BUFFER, sizeof( GLfloat ) * numHorizVertices * 2 * 4, NULL, GL_STATIC_DRAW );
+    glBindBuffer(GL_ARRAY_BUFFER, colorBuffer);
+    glBufferData(GL_ARRAY_BUFFER, sizeof(GLfloat) * numHorizVertices * 2 * 4,
+                 NULL, GL_STATIC_DRAW);
 
     // Now that the requisite objects are bound, we can attempt program
     // validation:
@@ -439,8 +488,9 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_
 
     GLint logLength, status;
     glGetProgramiv(shaderprogram, GL_INFO_LOG_LENGTH, &logLength);
-    if (logLength > 0) {
-        GLchar *log = (GLchar*)malloc(logLength);
+    if (logLength > 0)
+    {
+        GLchar *log = (GLchar *)malloc(logLength);
         glGetProgramInfoLog(shaderprogram, logLength, &logLength, log);
         log_info("Program validate log:\n%s", log);
         free(log);
@@ -455,125 +505,131 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_
 
     clProgramWrapper program;
     clKernelWrapper kernel;
-    clMemWrapper streams[ 2 ];
+    clMemWrapper streams[2];
 
-    if( create_single_kernel_helper( context, &program, &kernel, 1, updateBuffersKernel, "update" ) )
+    if (create_single_kernel_helper(context, &program, &kernel, 1,
+                                    updateBuffersKernel, "update"))
         return -1;
 
-    streams[ 0 ] = (*clCreateFromGLBuffer_ptr)( context, CL_MEM_READ_WRITE, vtxBuffer, &error );
-    test_error( error, "Unable to create CL buffer from GL vertex buffer" );
+    streams[0] = (*clCreateFromGLBuffer_ptr)(context, CL_MEM_READ_WRITE,
+                                             vtxBuffer, &error);
+    test_error(error, "Unable to create CL buffer from GL vertex buffer");
 
-    streams[ 1 ] = (*clCreateFromGLBuffer_ptr)( context, CL_MEM_READ_WRITE, colorBuffer, &error );
-    test_error( error, "Unable to create CL buffer from GL color buffer" );
+    streams[1] = (*clCreateFromGLBuffer_ptr)(context, CL_MEM_READ_WRITE,
+                                             colorBuffer, &error);
+    test_error(error, "Unable to create CL buffer from GL color buffer");
 
-    error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[ 0 ] );
-    test_error( error, "Unable to set kernel arguments" );
+    error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+    test_error(error, "Unable to set kernel arguments");
 
-    error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[ 1 ] );
-    test_error( error, "Unable to set kernel arguments" );
+    error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
+    test_error(error, "Unable to set kernel arguments");
 
     cl_int horizWrap = (cl_int)framebufferSize;
-    error = clSetKernelArg( kernel, 2, sizeof( horizWrap ), &horizWrap );
-    test_error( error, "Unable to set kernel arguments" );
+    error = clSetKernelArg(kernel, 2, sizeof(horizWrap), &horizWrap);
+    test_error(error, "Unable to set kernel arguments");
 
-    glViewport( 0, 0, framebufferSize, framebufferSize );
-    glClearColor( 0, 0, 0, 0 );
-    glClear( GL_COLOR_BUFFER_BIT );
-    glClear( GL_DEPTH_BUFFER_BIT );
-    glDisable( GL_DEPTH_TEST );
-    glEnable( GL_BLEND );
-    glBlendFunc( GL_ONE, GL_ONE );
+    glViewport(0, 0, framebufferSize, framebufferSize);
+    glClearColor(0, 0, 0, 0);
+    glClear(GL_COLOR_BUFFER_BIT);
+    glClear(GL_DEPTH_BUFFER_BIT);
+    glDisable(GL_DEPTH_TEST);
+    glEnable(GL_BLEND);
+    glBlendFunc(GL_ONE, GL_ONE);
 
     clEventWrapper fenceEvent;
     GLsync glFence = 0;
 
     // Do a loop through 8 different horizontal stripes against the framebuffer
-    RunThread thread( kernel, queue, streams[ 0 ], streams[ 1 ], (size_t)numHorizVertices );
+    RunThread thread(kernel, queue, streams[0], streams[1],
+                     (size_t)numHorizVertices);
 
-    for( int i = 0; i < 8; i++ )
+    for (int i = 0; i < 8; i++)
     {
         // if current rendering device is not the compute device and
         // separateThreads == false which means compute is going on same
         // thread and we are using implicit synchronization (no GLSync obj used)
-        // then glFlush by clEnqueueAcquireGLObject is not sufficient ... we need
-        // to wait for rendering to finish on other device before CL can start
-        // writing to CL/GL shared mem objects. When separateThreads is true i.e.
-        // we are using GLSync obj to synchronize then we dont need to call glFinish
-        // here since CL should wait for rendering on other device before this
-        // GLSync object to finish before it starts writing to shared mem object.
-        // Also rend_device == compute_device no need to call glFinish
-        if(rend_device != device && !separateThreads)
-            glFinish();
-
-        if( separateThreads )
+        // then glFlush by clEnqueueAcquireGLObject is not sufficient ... we
+        // need to wait for rendering to finish on other device before CL can
+        // start writing to CL/GL shared mem objects. When separateThreads is
+        // true, i.e. we are using a GLSync obj to synchronize, we don't need
+        // to call glFinish here, since CL should wait for the rendering
+        // preceding this GLSync object on the other device to finish before it
+        // starts writing to the shared mem object. Also, when rend_device ==
+        // compute_device there is no need to call glFinish
+        if (rend_device != device && !separateThreads) glFinish();
+
+        if (separateThreads)
         {
-            if (fenceEvent != NULL)
-            {
-                clReleaseEvent(fenceEvent);
-                glDeleteSyncFunc(glFence);
-            }
+            glDeleteSyncFunc(glFence);
 
             glFence = glFenceSyncFunc(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
-            fenceEvent = clCreateEventFromGLsyncKHR_ptr(context, glFence, &error);
+            fenceEvent =
+                clCreateEventFromGLsyncKHR_ptr(context, glFence, &error);
             test_error(error, "Unable to create CL event from GL fence");
 
-            // in case of explicit synchronization, we just wait for the sync object to complete
-            // in clEnqueueAcquireGLObject but we dont flush. Its application's responsibility
-            // to flush on the context on which glSync is created
+            // in case of explicit synchronization, we just wait for the sync
+            // object to complete in clEnqueueAcquireGLObject but we don't
+            // flush. It's the application's responsibility to flush on the
+            // context on which glSync is created
             glFlush();
 
-            thread.SetRunData( (cl_int)i, fenceEvent );
+            thread.SetRunData((cl_int)i, fenceEvent);
             thread.Start();
 
             error = (cl_int)(size_t)thread.Join();
         }
         else
         {
-            error = run_cl_kernel( kernel, queue, streams[ 0 ], streams[ 1 ], (cl_int)i, fenceEvent, (size_t)numHorizVertices );
+            error =
+                run_cl_kernel(kernel, queue, streams[0], streams[1], (cl_int)i,
+                              fenceEvent, (size_t)numHorizVertices);
         }
-        test_error( error, "Unable to run CL kernel" );
+        test_error(error, "Unable to run CL kernel");
 
         glUseProgram(shaderprogram);
         glEnableVertexAttribArray(posLoc);
         glEnableVertexAttribArray(colLoc);
-        glBindBuffer( GL_ARRAY_BUFFER, vtxBuffer );
-        glVertexAttribPointer(posLoc, 4, GL_FLOAT, GL_FALSE, 4*sizeof(GLfloat), 0);
-        glBindBuffer( GL_ARRAY_BUFFER, colorBuffer );
-        glVertexAttribPointer(colLoc, 4, GL_FLOAT, GL_FALSE, 4*sizeof(GLfloat), 0);
-        glBindBuffer( GL_ARRAY_BUFFER, 0 );
+        glBindBuffer(GL_ARRAY_BUFFER, vtxBuffer);
+        glVertexAttribPointer(posLoc, 4, GL_FLOAT, GL_FALSE,
+                              4 * sizeof(GLfloat), 0);
+        glBindBuffer(GL_ARRAY_BUFFER, colorBuffer);
+        glVertexAttribPointer(colLoc, 4, GL_FLOAT, GL_FALSE,
+                              4 * sizeof(GLfloat), 0);
+        glBindBuffer(GL_ARRAY_BUFFER, 0);
 
-        glDrawArrays( GL_TRIANGLE_STRIP, 0, numHorizVertices * 2 );
+        glDrawArrays(GL_TRIANGLE_STRIP, 0, numHorizVertices * 2);
 
         glDisableVertexAttribArray(posLoc);
         glDisableVertexAttribArray(colLoc);
         glUseProgram(0);
 
-        if( separateThreads )
+        if (separateThreads)
         {
-            // If we're on the same thread, then we're testing implicit syncing, so we
-            // don't need the actual fence code
-            if( fenceEvent != NULL )
-            {
-                clReleaseEvent( fenceEvent );
-                glDeleteSyncFunc( glFence );
-            }
+            // If we're on the same thread, then we're testing implicit syncing,
+            // so we don't need the actual fence code
+            glDeleteSyncFunc(glFence);
+
 
-            glFence = glFenceSyncFunc( GL_SYNC_GPU_COMMANDS_COMPLETE, 0 );
-            fenceEvent = clCreateEventFromGLsyncKHR_ptr( context, glFence, &error );
-            test_error( error, "Unable to create CL event from GL fence" );
+            glFence = glFenceSyncFunc(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+            fenceEvent =
+                clCreateEventFromGLsyncKHR_ptr(context, glFence, &error);
+            test_error(error, "Unable to create CL event from GL fence");
 
-            // in case of explicit synchronization, we just wait for the sync object to complete
-            // in clEnqueueAcquireGLObject but we dont flush. Its application's responsibility
-            // to flush on the context on which glSync is created
+            // in case of explicit synchronization, we just wait for the sync
+            // object to complete in clEnqueueAcquireGLObject but we don't
+            // flush. It's the application's responsibility to flush on the
+            // context on which glSync is created
             glFlush();
         }
         else
             glFinish();
     }
 
-    if( glFence != 0 )
-        // Don't need the final release for fenceEvent, because the wrapper will take care of that
-        glDeleteSyncFunc( glFence );
+    if (glFence != 0)
+        // Don't need the final release for fenceEvent, because the wrapper will
+        // take care of that
+        glDeleteSyncFunc(glFence);
 
 #ifdef __APPLE__
     CGLSetVirtualScreen(CGLGetCurrentContext(), read_vs);
@@ -585,54 +641,62 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_
 #endif
 #endif
     // Grab the contents of the final framebuffer
-    BufferOwningPtr<char> resultData( ReadGLRenderbuffer( glFramebuffer, glRenderbuffer,
-                                                         GL_COLOR_ATTACHMENT0_EXT,
-                                                         GL_RGBA, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, kUChar,
-                                                         framebufferSize, 128 ) );
-
-    // Check the contents now. We should end up with solid color bands 32 pixels high and the
-    // full width of the framebuffer, at values (128,128,128) due to the additive blending
-    for( int i = 0; i < 8; i++ )
+    BufferOwningPtr<char> resultData(ReadGLRenderbuffer(
+        glFramebuffer, glRenderbuffer, GL_COLOR_ATTACHMENT0_EXT, GL_RGBA,
+        GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, kUChar, framebufferSize, 128));
+
+    // Check the contents now. We should end up with solid color bands 32 pixels
+    // high and the full width of the framebuffer, at values (128,128,128) due
+    // to the additive blending
+    for (int i = 0; i < 8; i++)
     {
-        for( int y = 0; y < 4; y++ )
+        for (int y = 0; y < 4; y++)
         {
-            // Note: coverage will be double because the 63-0 triangle overwrites again at the end of the pass
-            cl_uchar valA = ( ( ( i + 1 ) & 1 )      ) * numHorizVertices * 2 / framebufferSize;
-            cl_uchar valB = ( ( ( i + 1 ) & 2 ) >> 1 ) * numHorizVertices * 2 / framebufferSize;
-            cl_uchar valC = ( ( ( i + 1 ) & 4 ) >> 2 ) * numHorizVertices * 2 / framebufferSize;
-
-            cl_uchar *row = (cl_uchar *)&resultData[ ( i * 16 + y ) * framebufferSize * 4 ];
-            for( int x = 0; x < ( framebufferSize - 1 ) - 1; x++ )
+            // Note: coverage will be double because the 63-0 triangle
+            // overwrites again at the end of the pass
+            cl_uchar valA =
+                (((i + 1) & 1)) * numHorizVertices * 2 / framebufferSize;
+            cl_uchar valB =
+                (((i + 1) & 2) >> 1) * numHorizVertices * 2 / framebufferSize;
+            cl_uchar valC =
+                (((i + 1) & 4) >> 2) * numHorizVertices * 2 / framebufferSize;
+
+            cl_uchar *row =
+                (cl_uchar *)&resultData[(i * 16 + y) * framebufferSize * 4];
+            for (int x = 0; x < (framebufferSize - 1) - 1; x++)
             {
-                if( ( row[ x * 4 ] != valA ) || ( row[ x * 4 + 1 ] != valB ) ||
-                   ( row[ x * 4 + 2 ] != valC ) )
+                if ((row[x * 4] != valA) || (row[x * 4 + 1] != valB)
+                    || (row[x * 4 + 2] != valC))
                 {
-                    log_error( "ERROR: Output framebuffer did not validate!\n" );
-                    DumpGLBuffer( GL_UNSIGNED_BYTE, framebufferSize, 128, resultData );
-                    log_error( "RUNS:\n" );
+                    log_error("ERROR: Output framebuffer did not validate!\n");
+                    DumpGLBuffer(GL_UNSIGNED_BYTE, framebufferSize, 128,
+                                 resultData);
+                    log_error("RUNS:\n");
                     uint32_t *p = (uint32_t *)(char *)resultData;
                     size_t a = 0;
-                    for( size_t t = 1; t < framebufferSize * framebufferSize; t++ )
+                    for (size_t t = 1; t < framebufferSize * framebufferSize;
+                         t++)
                     {
-                        if( p[ a ] != 0 )
+                        if (p[a] != 0)
                         {
-                            if( p[ t ] == 0 )
+                            if (p[t] == 0)
                             {
-                                log_error( "RUN: %ld to %ld (%d,%d to %d,%d) 0x%08x\n", a, t - 1,
-                                          (int)( a % framebufferSize ), (int)( a / framebufferSize ),
-                                          (int)( ( t - 1 ) % framebufferSize ), (int)( ( t - 1 ) / framebufferSize ),
-                                          p[ a ] );
+                                log_error(
+                                    "RUN: %ld to %ld (%d,%d to %d,%d) 0x%08x\n",
+                                    a, t - 1, (int)(a % framebufferSize),
+                                    (int)(a / framebufferSize),
+                                    (int)((t - 1) % framebufferSize),
+                                    (int)((t - 1) / framebufferSize), p[a]);
                                 a = t;
                             }
                         }
                         else
                         {
-                            if( p[ t ] != 0 )
+                            if (p[t] != 0)
                             {
                                 a = t;
                             }
                         }
-
                     }
                     return -1;
                 }
@@ -645,46 +709,56 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_
     return 0;
 }
 
-int test_fence_sync( cl_device_id device, cl_context context, cl_command_queue queue, int numElements )
+int test_fence_sync(cl_device_id device, cl_context context,
+                    cl_command_queue queue, int numElements)
 {
     GLint vs_count = 0;
     cl_device_id *device_list = NULL;
 
-    if( !is_extension_available( device, "cl_khr_gl_event" ) )
+    if (!is_extension_available(device, "cl_khr_gl_event"))
     {
-        log_info( "NOTE: cl_khr_gl_event extension not present on this device; skipping fence sync test\n" );
+        log_info("NOTE: cl_khr_gl_event extension not present on this device; "
+                 "skipping fence sync test\n");
         return 0;
     }
 #ifdef __APPLE__
     CGLContextObj ctx = CGLGetCurrentContext();
     CGLPixelFormatObj pix = CGLGetPixelFormat(ctx);
-    CGLError err = CGLDescribePixelFormat(pix, 0, kCGLPFAVirtualScreenCount, &vs_count);
+    CGLError err =
+        CGLDescribePixelFormat(pix, 0, kCGLPFAVirtualScreenCount, &vs_count);
 
-    device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*vs_count);
-    clGetGLContextInfoAPPLE(context, ctx, CL_CGL_DEVICES_FOR_SUPPORTED_VIRTUAL_SCREENS_APPLE, sizeof(cl_device_id)*vs_count, device_list, NULL);
+    device_list = (cl_device_id *)malloc(sizeof(cl_device_id) * vs_count);
+    clGetGLContextInfoAPPLE(context, ctx,
+                            CL_CGL_DEVICES_FOR_SUPPORTED_VIRTUAL_SCREENS_APPLE,
+                            sizeof(cl_device_id) * vs_count, device_list, NULL);
 #else
-    // Need platform specific way of getting devices from CL context to which OpenGL can render
-    // If not available it can be replaced with clGetContextInfo with CL_CONTEXT_DEVICES
+    // Need platform specific way of getting devices from CL context to which
+    // OpenGL can render. If not available, it can be replaced with
+    // clGetContextInfo with CL_CONTEXT_DEVICES
     size_t device_cb;
-    cl_int err = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &device_cb);
-    if( err != CL_SUCCESS )
+    cl_int err =
+        clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &device_cb);
+    if (err != CL_SUCCESS)
     {
-      print_error( err, "Unable to get device count from context" );
-      return -1;
+        print_error(err, "Unable to get device count from context");
+        return -1;
     }
     vs_count = (GLint)device_cb / sizeof(cl_device_id);
 
-    if (vs_count < 1) {
-      log_error("No devices found.\n");
-      return -1;
+    if (vs_count < 1)
+    {
+        log_error("No devices found.\n");
+        return -1;
     }
 
-    device_list = (cl_device_id *) malloc(device_cb);
-    err = clGetContextInfo( context, CL_CONTEXT_DEVICES, device_cb, device_list, NULL);
-    if( err != CL_SUCCESS ) {
-      free(device_list);
-      print_error( err, "Unable to get device list from context" );
-      return -1;
+    device_list = (cl_device_id *)malloc(device_cb);
+    err = clGetContextInfo(context, CL_CONTEXT_DEVICES, device_cb, device_list,
+                           NULL);
+    if (err != CL_SUCCESS)
+    {
+        free(device_list);
+        print_error(err, "Unable to get device list from context");
+        return -1;
     }
 
 #endif
@@ -695,30 +769,38 @@ int test_fence_sync( cl_device_id device, cl_context context, cl_command_queue q
 
     // Loop through all the devices capable to OpenGL rendering
     // and set them as current rendering target
-    for(rend_vs = 0; rend_vs < vs_count; rend_vs++)
+    for (rend_vs = 0; rend_vs < vs_count; rend_vs++)
     {
         // Loop through all the devices and set them as current
         // compute target
-        for(read_vs = 0; read_vs < vs_count; read_vs++)
+        for (read_vs = 0; read_vs < vs_count; read_vs++)
         {
-            cl_device_id rend_device = device_list[rend_vs], read_device = device_list[read_vs];
+            cl_device_id rend_device = device_list[rend_vs],
+                         read_device = device_list[read_vs];
             char rend_name[200], read_name[200];
 
-            clGetDeviceInfo(rend_device, CL_DEVICE_NAME, sizeof(rend_name), rend_name, NULL);
-            clGetDeviceInfo(read_device, CL_DEVICE_NAME, sizeof(read_name), read_name, NULL);
+            clGetDeviceInfo(rend_device, CL_DEVICE_NAME, sizeof(rend_name),
+                            rend_name, NULL);
+            clGetDeviceInfo(read_device, CL_DEVICE_NAME, sizeof(read_name),
+                            read_name, NULL);
 
-            log_info("Rendering on: %s, read back on: %s\n", rend_name, read_name);
-            error = test_fence_sync_single( device, context, queue, false, rend_vs, read_vs, rend_device );
+            log_info("Rendering on: %s, read back on: %s\n", rend_name,
+                     read_name);
+            error = test_fence_sync_single(device, context, queue, false,
+                                           rend_vs, read_vs, rend_device);
             any_failed |= error;
-            if( error != 0 )
-                log_error( "ERROR: Implicit syncing with GL sync events failed!\n\n" );
+            if (error != 0)
+                log_error(
+                    "ERROR: Implicit syncing with GL sync events failed!\n\n");
             else
                 log_info("Implicit syncing Passed\n");
 
-            error = test_fence_sync_single( device, context, queue, true, rend_vs, read_vs, rend_device );
+            error = test_fence_sync_single(device, context, queue, true,
+                                           rend_vs, read_vs, rend_device);
             any_failed |= error;
-            if( error != 0 )
-                log_error( "ERROR: Explicit syncing with GL sync events failed!\n\n" );
+            if (error != 0)
+                log_error(
+                    "ERROR: Explicit syncing with GL sync events failed!\n\n");
             else
                 log_info("Explicit syncing Passed\n");
         }

From 79f692d8e59f37236c179ebbca086231d5f5c9bc Mon Sep 17 00:00:00 2001
From: Sreelakshmi Haridas Maruthur <sharidas@quicinc.com>
Date: Wed, 21 Jul 2021 01:51:29 -0600
Subject: [PATCH 113/158] subgroups: Fix setting cl_halfs and progress check.
 (#1278)

* subgroups: Fix setting cl_halfs and progress check.

cl_float testing uses set_value such that a generated cl_ulong of 1 is
stored as 1.0F in a logical sense. However, cl_half values aren't
intrinsic to C++ and generated cl_ulongs less than 1024 in particular
are interpreted bitwise as subnormals. The test fails on compute devices
lacking subnormal support. Perform the logical conversion to cl_half.

Fix independent forward progress check.

* subgroups_half: Address review comments

* subgroups_half: Formatting fixes required by check-format

* subgroups_half: Modified to query and use rounding mode supported by device

Co-authored-by: spauls <spauls@qti.qualcomm.com>
---
 test_conformance/subgroups/main.cpp           | 18 +++++++++++++
 .../subgroups/subgroup_common_templates.h     |  2 +-
 test_conformance/subgroups/subhelpers.h       |  3 ++-
 test_conformance/subgroups/test_ifp.cpp       | 26 +++++++++++--------
 4 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/test_conformance/subgroups/main.cpp b/test_conformance/subgroups/main.cpp
index 44416dd729..ebe9455836 100644
--- a/test_conformance/subgroups/main.cpp
+++ b/test_conformance/subgroups/main.cpp
@@ -19,8 +19,10 @@
 #include <string.h>
 #include "procs.h"
 #include "harness/testHarness.h"
+#include "CL/cl_half.h"
 
 MTdata gMTdata;
+cl_half_rounding_mode g_rounding_mode;
 
 test_definition test_list[] = {
     ADD_TEST_VERSION(sub_group_info_ext, Version(2, 0)),
@@ -66,6 +68,22 @@ static test_status InitCL(cl_device_id device)
             ret = TEST_SKIP;
         }
     }
+    // Determine the rounding mode to be used in float to half conversions in
+    // init and reference code
+    const cl_device_fp_config fpConfig = get_default_rounding_mode(device);
+
+    if (fpConfig == CL_FP_ROUND_TO_NEAREST)
+    {
+        g_rounding_mode = CL_HALF_RTE;
+    }
+    else if (fpConfig == CL_FP_ROUND_TO_ZERO && gIsEmbedded)
+    {
+        g_rounding_mode = CL_HALF_RTZ;
+    }
+    else
+    {
+        assert(false && "Unreachable");
+    }
     return ret;
 }
 
diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h
index b30c416b1a..4333e95b6c 100644
--- a/test_conformance/subgroups/subgroup_common_templates.h
+++ b/test_conformance/subgroups/subgroup_common_templates.h
@@ -301,7 +301,7 @@ static float to_float(subgroups::cl_half x) { return cl_half_to_float(x.data); }
 static subgroups::cl_half to_half(float x)
 {
     subgroups::cl_half value;
-    value.data = cl_half_from_float(x, CL_HALF_RTE);
+    value.data = cl_half_from_float(x, g_rounding_mode);
     return value;
 }
 
diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h
index 93673b3579..9232cdedc0 100644
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -28,6 +28,7 @@
 #define NR_OF_ACTIVE_WORK_ITEMS 4
 
 extern MTdata gMTdata;
+extern cl_half_rounding_mode g_rounding_mode;
 
 struct WorkGroupParams
 {
@@ -1080,7 +1081,7 @@ template <typename Ty>
 typename std::enable_if<TypeManager<Ty>::is_sb_scalar_type::value>::type
 set_value(Ty &lhs, const cl_ulong &rhs)
 {
-    lhs.data = rhs;
+    lhs.data = cl_half_from_float(static_cast<cl_float>(rhs), g_rounding_mode);
 }
 
 // compare for common vectors
diff --git a/test_conformance/subgroups/test_ifp.cpp b/test_conformance/subgroups/test_ifp.cpp
index 428f2cdcde..fccaa8c79d 100644
--- a/test_conformance/subgroups/test_ifp.cpp
+++ b/test_conformance/subgroups/test_ifp.cpp
@@ -360,17 +360,21 @@ int test_ifp_ext(cl_device_id device, cl_context context,
     }
     // ifp only in subgroup functions tests:
     test_status error;
-    error = checkIFPSupport(device, ifpSupport);
-    if (error != TEST_PASS)
+    auto device_cl_version = get_device_cl_version(device);
+    if (device_cl_version >= Version(2, 1))
     {
-        return error;
-    }
-    if (ifpSupport == false)
-    {
-        log_info(
-            "Error reason: the extension cl_khr_subgroups requires that "
-            "Independed forward progress has to be supported by device.\n");
-        return TEST_FAIL;
+        error = checkIFPSupport(device, ifpSupport);
+        if (error != TEST_PASS)
+        {
+            return error;
+        }
+        if (ifpSupport == false)
+        {
+            log_info(
+                "Error reason: the extension cl_khr_subgroups requires that "
+                "Independed forward progress has to be supported by device.\n");
+            return TEST_FAIL;
+        }
     }
     return test_ifp(device, context, queue, num_elements, false);
-}
\ No newline at end of file
+}

From cc0b46e4570d936c38795a20c11315f13fa25c85 Mon Sep 17 00:00:00 2001
From: kalchr01 <83217667+kalchr01@users.noreply.github.com>
Date: Mon, 9 Aug 2021 11:20:40 +0100
Subject: [PATCH 114/158] Add tests for entrypoint
 cl_khr_suggested_local_work_size (#1264)

* Add tests for entrypoint cl_khr_suggested_local_work_size

Tests added within test_conformance/workgroups. The tests cover several
shapes (num dimensions) and sizes of global work size, kernels using
local memory (dynamic and static) and present/non-present global work
offset.

Signed-off-by: Kallia Chronaki <kallia.chronaki@arm.com>

* Fix in comparison for error checking

Signed-off-by: Kallia Chronaki <kallia.chronaki@arm.com>

* 'test_wg_suggested_local_work_size' fixes

* Refactoring of 'test_wg_suggested_local_work_size'

Modifications to reduce code duplication and minimize build time
---
 test_conformance/workgroups/CMakeLists.txt    |   1 +
 test_conformance/workgroups/main.cpp          |  33 +-
 test_conformance/workgroups/procs.h           |  18 +-
 .../test_wg_suggested_local_work_size.cpp     | 611 ++++++++++++++++++
 4 files changed, 646 insertions(+), 17 deletions(-)
 create mode 100644 test_conformance/workgroups/test_wg_suggested_local_work_size.cpp

diff --git a/test_conformance/workgroups/CMakeLists.txt b/test_conformance/workgroups/CMakeLists.txt
index 088860868a..c90bef8858 100644
--- a/test_conformance/workgroups/CMakeLists.txt
+++ b/test_conformance/workgroups/CMakeLists.txt
@@ -14,6 +14,7 @@ set(${MODULE_NAME}_SOURCES
     test_wg_scan_inclusive_add.cpp
     test_wg_scan_inclusive_min.cpp
     test_wg_scan_inclusive_max.cpp
+    test_wg_suggested_local_work_size.cpp
 )
 
 include(../CMakeCommon.txt)
diff --git a/test_conformance/workgroups/main.cpp b/test_conformance/workgroups/main.cpp
index 41ffa74192..abb1145b3c 100644
--- a/test_conformance/workgroups/main.cpp
+++ b/test_conformance/workgroups/main.cpp
@@ -24,27 +24,30 @@
 #endif
 
 test_definition test_list[] = {
-    ADD_TEST(work_group_all),
-    ADD_TEST(work_group_any),
-    ADD_TEST(work_group_reduce_add),
-    ADD_TEST(work_group_reduce_min),
-    ADD_TEST(work_group_reduce_max),
-    ADD_TEST(work_group_scan_inclusive_add),
-    ADD_TEST(work_group_scan_inclusive_min),
-    ADD_TEST(work_group_scan_inclusive_max),
-    ADD_TEST(work_group_scan_exclusive_add),
-    ADD_TEST(work_group_scan_exclusive_min),
-    ADD_TEST(work_group_scan_exclusive_max),
-    ADD_TEST(work_group_broadcast_1D),
-    ADD_TEST(work_group_broadcast_2D),
-    ADD_TEST(work_group_broadcast_3D),
+    ADD_TEST_VERSION(work_group_all, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_any, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_reduce_add, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_reduce_min, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_reduce_max, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_scan_inclusive_add, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_scan_inclusive_min, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_scan_inclusive_max, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_scan_exclusive_add, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_scan_exclusive_min, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_scan_exclusive_max, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_broadcast_1D, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_broadcast_2D, Version(2, 0)),
+    ADD_TEST_VERSION(work_group_broadcast_3D, Version(2, 0)),
+    ADD_TEST(work_group_suggested_local_size_1D),
+    ADD_TEST(work_group_suggested_local_size_2D),
+    ADD_TEST(work_group_suggested_local_size_3D)
 };
 
 const int test_num = ARRAY_SIZE(test_list);
 
 test_status InitCL(cl_device_id device) {
     auto version = get_device_cl_version(device);
-    auto expected_min_version = Version(2, 0);
+    auto expected_min_version = Version(1, 2);
     if (version < expected_min_version)
     {
         version_expected_info("Test", "OpenCL",
diff --git a/test_conformance/workgroups/procs.h b/test_conformance/workgroups/procs.h
index 2e6e79e262..6143d52531 100644
--- a/test_conformance/workgroups/procs.h
+++ b/test_conformance/workgroups/procs.h
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2017, 2021 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -16,6 +16,7 @@
 #include "harness/testHarness.h"
 #include "harness/kernelHelpers.h"
 #include "harness/errorHelpers.h"
+#include "harness/typeWrappers.h"
 #include "harness/conversions.h"
 #include "harness/mt19937.h"
 
@@ -36,3 +37,16 @@ extern int test_work_group_scan_exclusive_max(cl_device_id deviceID, cl_context
 extern int test_work_group_scan_inclusive_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int test_work_group_scan_inclusive_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int test_work_group_scan_inclusive_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int test_work_group_suggested_local_size_1D(cl_device_id device,
+                                                   cl_context context,
+                                                   cl_command_queue queue,
+                                                   int n_elems);
+extern int test_work_group_suggested_local_size_2D(cl_device_id device,
+                                                   cl_context context,
+                                                   cl_command_queue queue,
+                                                   int n_elems);
+extern int test_work_group_suggested_local_size_3D(cl_device_id device,
+                                                   cl_context context,
+                                                   cl_command_queue queue,
+                                                   int n_elems);
diff --git a/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
new file mode 100644
index 0000000000..1dc1b39c9f
--- /dev/null
+++ b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
@@ -0,0 +1,611 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "harness/compat.h"
+
+#include <stdio.h>
+#include <iostream>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "procs.h"
+#include <CL/cl_ext.h>
+
+/** @brief Gets the number of elements of type s in a fixed length array of s */
+#define NELEMS(s) (sizeof(s) / sizeof((s)[0]))
+#define test_error_ret_and_free(errCode, msg, retValue, ptr)                   \
+    {                                                                          \
+        auto errCodeResult = errCode;                                          \
+        if (errCodeResult != CL_SUCCESS)                                       \
+        {                                                                      \
+            print_error(errCodeResult, msg);                                   \
+            free(ptr);                                                         \
+            return retValue;                                                   \
+        }                                                                      \
+    }
+
+const char* wg_scan_local_work_group_size = R"(
+    bool is_zero_linear_id()
+    {
+        size_t linear_id;
+#if __OPENCL_VERSION__ < CL_VERSION_2_0
+        linear_id = ((get_global_id(2) - get_global_offset(2)) * get_global_size(1) * get_global_size(0)) + 
+                    ((get_global_id(1) - get_global_offset(1)) * get_global_size(0)) + 
+                    (get_global_id(0) - get_global_offset(0));
+#else
+        linear_id = get_global_linear_id();
+#endif
+        return linear_id == 0;
+    }
+
+    uint get_l_size(size_t dim)
+    {
+#if __OPENCL_VERSION__ < CL_VERSION_2_0
+        return get_local_size(dim);
+#else
+        return get_enqueued_local_size(dim);
+#endif
+    }
+
+    __kernel void test_wg_scan_local_work_group_size(global uint *output)
+    {
+        if(!is_zero_linear_id()) return;
+        for (uint i = 0; i < 3; i++)
+        {
+            output[i] = get_l_size(i);
+        }
+    }
+    __kernel void test_wg_scan_local_work_group_size_static_local(
+                                            global uint *output)
+    {
+        __local char c[LOCAL_MEM_SIZE];
+    
+        if(!is_zero_linear_id()) return;
+        for (uint i = 0; i < 3; i++)
+        {
+            output[i] = get_l_size(i);
+        }
+    }
+    __kernel void test_wg_scan_local_work_group_size_dynlocal(
+                                        global uint *output,
+                                        __local char * c)
+    {
+        if(!is_zero_linear_id()) return;
+        for (uint i = 0; i < 3; i++)
+        {
+            output[i] = get_l_size(i);
+        }
+    };)";
+
+bool is_prime(size_t a)
+{
+    size_t c;
+
+    for (c = 2; c < a; c++)
+    {
+        if (a % c == 0) return false;
+    }
+    return true;
+}
+
+bool is_not_prime(size_t a) { return !is_prime(a); }
+
+bool is_not_even(size_t a) { return (is_prime(a) || (a % 2 == 1)); }
+
+bool is_not_odd(size_t a) { return (is_prime(a) || (a % 2 == 0)); }
+
+#define NELEMS(s) (sizeof(s) / sizeof((s)[0]))
+/* The numbers we chose in the value_range are to be used for the second and
+   third dimension of the global work group size. The numbers below cover many
+   different cases: 1024 is a power of 2, 3 is an odd and small prime number, 12
+   is a multiple of 4 but not a power of 2, 1031 is a large odd and prime number
+   and 1 is to test the lack of this dimension if the others are present */
+const size_t value_range[] = { 1024, 3, 12, 1031, 1 };
+/* The value_range_nD contains numbers to be used for the experiments with 2D
+   and 3D global work sizes. This is because we need smaller numbers so that the
+   resulting number of work items is meaningful and does not become too large.
+   The cases here are: 64 is a power of 2, 3 is an odd and small prime
+   number, 12 is a multiple of 4 but not a power of 2, 113 is a large prime
+   number, and 1 is to test the lack of this dimension if the others are
+   present */
+const size_t value_range_nD[] = { 64, 3, 12, 113, 1 };
+const size_t basic_increment = 16;
+const size_t primes_increment = 1;
+enum num_dims
+{
+    _1D = 1,
+    _2D = 2,
+    _3D = 3
+};
+
+int do_test(cl_device_id device, cl_context context, cl_command_queue queue,
+            cl_kernel scan_kernel, int work_dim, size_t global_work_offset[3],
+            size_t test_values[3], size_t dyn_mem_size)
+{
+    size_t local_work_size[] = { 1, 1, 1 };
+    size_t suggested_total_size;
+    size_t workgroupinfo_size;
+    cl_uint kernel_work_size[3] = { 0 };
+    clMemWrapper buffer;
+    cl_platform_id platform;
+
+    int err = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform),
+                              &platform, NULL);
+    test_error_ret(err, "clGetDeviceInfo failed", -1);
+    clGetKernelSuggestedLocalWorkSizeKHR_fn
+        clGetKernelSuggestedLocalWorkSizeKHR =
+            (clGetKernelSuggestedLocalWorkSizeKHR_fn)
+                clGetExtensionFunctionAddressForPlatform(
+                    platform, "clGetKernelSuggestedLocalWorkSizeKHR");
+
+    if (clGetKernelSuggestedLocalWorkSizeKHR == NULL)
+    {
+        log_info("Extension 'cl_khr_suggested_local_work_size' could not be "
+                 "found.\n");
+        return TEST_FAIL;
+    }
+
+    /* Create the buffer bound to kernel argument 0 and read back below; no host
+     * pointer is supplied, so nothing is copied into it at creation */
+    buffer = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                            sizeof(kernel_work_size), NULL, &err);
+    test_error_ret(err, "clCreateBuffer failed", -1);
+    err = clSetKernelArg(scan_kernel, 0, sizeof(buffer), &buffer);
+    test_error_ret(err, "clSetKernelArg failed", -1);
+    if (dyn_mem_size)
+    {
+        err = clSetKernelArg(scan_kernel, 1, dyn_mem_size, NULL);
+        test_error_ret(err, "clSetKernelArg failed", -1);
+    }
+    err = clGetKernelSuggestedLocalWorkSizeKHR(queue, scan_kernel, work_dim,
+                                               global_work_offset, test_values,
+                                               local_work_size);
+    test_error_ret(err, "clGetKernelSuggestedLocalWorkSizeKHR failed", -1);
+    suggested_total_size =
+        local_work_size[0] * local_work_size[1] * local_work_size[2];
+    err = clGetKernelWorkGroupInfo(
+        scan_kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
+        sizeof(workgroupinfo_size), &workgroupinfo_size, NULL);
+    test_error_ret(err, "clGetKernelWorkGroupInfo failed", -1);
+    if (suggested_total_size > workgroupinfo_size)
+    {
+        std::cout << "The suggested work group size consist of "
+                  << suggested_total_size << " work items.\n"
+                  << "Work items are limited by " << workgroupinfo_size
+                  << std::endl;
+        std::cout << "Size from clGetKernelWorkGroupInfo: "
+                  << workgroupinfo_size;
+        std::cout << "\nSize from clGetKernelSuggestedLocalWorkSizeKHR: "
+                  << local_work_size[0] * local_work_size[1]
+                * local_work_size[2]
+                  << std::endl;
+        return -1;
+    }
+
+    err =
+        clEnqueueNDRangeKernel(queue, scan_kernel, work_dim, global_work_offset,
+                               test_values, // global work size
+                               NULL, 0, NULL, NULL);
+    test_error_ret(err, "clEnqueueNDRangeKernel failed", -1);
+    err = clEnqueueReadBuffer(queue, buffer, CL_NON_BLOCKING, 0,
+                              sizeof(kernel_work_size), kernel_work_size, 0,
+                              NULL, NULL);
+    test_error_ret(err, "clEnqueueReadBuffer failed", -1);
+    err = clFinish(queue);
+    test_error_ret(err, "clFinish failed", -1);
+
+    if (kernel_work_size[0] != local_work_size[0]
+        || kernel_work_size[1] != local_work_size[1]
+        || kernel_work_size[2] != local_work_size[2])
+    {
+        std::cout
+            << "Kernel work size differs from local work size suggested:\n"
+            << "Kernel work size: (" << kernel_work_size[0] << ", "
+            << kernel_work_size[1] << ", " << kernel_work_size[2] << ")"
+            << "Local work size: (" << local_work_size[0] << ", "
+            << local_work_size[1] << ", " << local_work_size[2] << ")\n";
+        return -1;
+    }
+    return err;
+}
+
+int do_test_work_group_suggested_local_size(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    bool (*skip_cond)(size_t), size_t start, size_t end, size_t incr,
+    cl_long max_local_mem_size, size_t global_work_offset[], num_dims dim)
+{
+    clProgramWrapper scan_program;
+    clKernelWrapper scan_kernel;
+    int err = 0;
+    size_t test_values[] = { 1, 1, 1 };
+    std::string kernel_names[6] = {
+        "test_wg_scan_local_work_group_size",
+        "test_wg_scan_local_work_group_size_static_local",
+        "test_wg_scan_local_work_group_size_static_local",
+        "test_wg_scan_local_work_group_size_static_local",
+        "test_wg_scan_local_work_group_size_static_local",
+        "test_wg_scan_local_work_group_size_dynlocal"
+    };
+    std::string str_local_mem_size[6] = {
+        "-DLOCAL_MEM_SIZE=1",     "-DLOCAL_MEM_SIZE=1024",
+        "-DLOCAL_MEM_SIZE=4096",  "-DLOCAL_MEM_SIZE=16384",
+        "-DLOCAL_MEM_SIZE=32768", "-DLOCAL_MEM_SIZE=1"
+    };
+    size_t local_mem_size[6] = { 1, 1024, 4096, 16384, 32768, 1 };
+    size_t dyn_mem_size[6] = { 0, 0, 0, 0, 0, 1024 };
+    cl_ulong kernel_local_mem_size;
+    for (int kernel_num = 0; kernel_num < 6; kernel_num++)
+    {
+        if (max_local_mem_size < local_mem_size[kernel_num]) continue;
+        // Create the kernel
+        err = create_single_kernel_helper(
+            context, &scan_program, &scan_kernel, 1,
+            &wg_scan_local_work_group_size, (kernel_names[kernel_num]).c_str(),
+            (str_local_mem_size[kernel_num]).c_str());
+        test_error_ret(err,
+                       ("create_single_kernel_helper failed for kernel "
+                        + kernel_names[kernel_num])
+                           .c_str(),
+                       -1);
+
+        // Check if the local memory used by the kernel is going to exceed the
+        // max_local_mem_size
+        err = clGetKernelWorkGroupInfo(
+            scan_kernel, device, CL_KERNEL_LOCAL_MEM_SIZE,
+            sizeof(kernel_local_mem_size), &kernel_local_mem_size, NULL);
+        test_error_ret(err, "clGetKernelWorkGroupInfo failed", -1);
+        if (kernel_local_mem_size > max_local_mem_size) continue;
+        // return error if no number is found due to the skip condition
+        err = -1;
+        unsigned int j = 0;
+        size_t num_elems = NELEMS(value_range_nD);
+        for (size_t i = start; i < end; i += incr)
+        {
+            if (skip_cond(i)) continue;
+            err = 0;
+            test_values[0] = i;
+            if (dim == _2D) test_values[1] = value_range_nD[j++ % num_elems];
+            if (dim == _3D)
+            {
+                test_values[1] = value_range_nD[j++ % num_elems];
+                test_values[2] = value_range_nD[rand() % num_elems];
+            }
+            err |= do_test(device, context, queue, scan_kernel, dim,
+                           global_work_offset, test_values,
+                           dyn_mem_size[kernel_num]);
+            test_error_ret(
+                err,
+                ("do_test failed for kernel " + kernel_names[kernel_num])
+                    .c_str(),
+                -1);
+        }
+    }
+    return err;
+}
+
+int test_work_group_suggested_local_size_1D(cl_device_id device,
+                                            cl_context context,
+                                            cl_command_queue queue, int n_elems)
+{
+    if (!is_extension_available(device, "cl_khr_suggested_local_work_size"))
+    {
+        log_info("Device does not support 'cl_khr_suggested_local_work_size'. "
+                 "Skipping the test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
+    cl_long max_local_mem_size;
+    cl_int err =
+        clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE,
+                        sizeof(max_local_mem_size), &max_local_mem_size, NULL);
+    test_error_ret(err, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.",
+                   -1);
+
+    size_t start, end, incr;
+    size_t global_work_offset[] = { 0, 0, 0 };
+    size_t max_work_items = 0;
+    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                    sizeof(max_work_items), &max_work_items, NULL);
+
+    // odds
+    start = 1;
+    end = max_work_items;
+    incr = basic_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_odd, start, end, incr,
+        max_local_mem_size, global_work_offset, _1D);
+    test_error_ret(
+        err, "test_work_group_suggested_local_size_1D for odds failed.", -1);
+    log_info("test_work_group_suggested_local_size_1D odds passed\n");
+
+    // evens
+    start = 2;
+    end = max_work_items;
+    incr = basic_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_even, start, end, incr,
+        max_local_mem_size, global_work_offset, _1D);
+    test_error_ret(
+        err, "test_work_group_suggested_local_size_1D for evens failed.", -1);
+    log_info("test_work_group_suggested_local_size_1D evens passed\n");
+
+    // primes
+    start = max_work_items + 1;
+    end = 2 * max_work_items;
+    incr = primes_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_prime, start, end, incr,
+        max_local_mem_size, global_work_offset, _1D);
+    test_error_ret(
+        err, "test_work_group_suggested_local_size_1D for primes failed.", -1);
+    log_info("test_work_group_suggested_local_size_1D primes passed\n");
+
+    global_work_offset[0] = 10;
+    global_work_offset[1] = 10;
+    global_work_offset[2] = 10;
+    // odds
+    start = 1;
+    end = max_work_items;
+    incr = basic_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_odd, start, end, incr,
+        max_local_mem_size, global_work_offset, _1D);
+    test_error_ret(err,
+                   "test_work_group_suggested_local_size_1D for odds with "
+                   "global_work_offset failed.",
+                   -1);
+    log_info("test_work_group_suggested_local_size_1D odds with "
+             "global_work_offset passed\n");
+
+    // evens
+    start = 2;
+    end = max_work_items;
+    incr = basic_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_even, start, end, incr,
+        max_local_mem_size, global_work_offset, _1D);
+    test_error_ret(err,
+                   "test_work_group_suggested_local_size_1D for evens with "
+                   "global_work_offset failed.",
+                   -1);
+    log_info("test_work_group_suggested_local_size_1D evens with "
+             "global_work_offset passed\n");
+
+    // primes
+    start = max_work_items + 1;
+    end = 2 * max_work_items;
+    incr = primes_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_prime, start, end, incr,
+        max_local_mem_size, global_work_offset, _1D);
+    test_error_ret(err,
+                   "test_work_group_suggested_local_size_1D for primes with "
+                   "global_work_offset failed.",
+                   -1);
+    log_info("test_work_group_suggested_local_size_1D primes with "
+             "global_work_offset passed\n");
+
+    return err;
+}
+
+int test_work_group_suggested_local_size_2D(cl_device_id device,
+                                            cl_context context,
+                                            cl_command_queue queue, int n_elems)
+{
+    if (!is_extension_available(device, "cl_khr_suggested_local_work_size"))
+    {
+        log_info("Device does not support 'cl_khr_suggested_local_work_size'. "
+                 "Skipping the test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
+    cl_long max_local_mem_size;
+    cl_int err =
+        clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE,
+                        sizeof(max_local_mem_size), &max_local_mem_size, NULL);
+    test_error_ret(err, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.",
+                   -1);
+
+    size_t start, end, incr;
+    size_t global_work_offset[] = { 0, 0, 0 };
+    size_t max_work_items = 0;
+    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                    sizeof(max_work_items), &max_work_items, NULL);
+
+    // odds
+    start = 1;
+    end = max_work_items;
+    incr = basic_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_odd, start, end, incr,
+        max_local_mem_size, global_work_offset, _2D);
+    test_error_ret(
+        err, "test_work_group_suggested_local_size_2D for odds failed.", -1);
+    log_info("test_work_group_suggested_local_size_2D odds passed\n");
+
+    // evens
+    start = 2;
+    end = max_work_items;
+    incr = basic_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_even, start, end, incr,
+        max_local_mem_size, global_work_offset, _2D);
+    test_error_ret(
+        err, "test_work_group_suggested_local_size_2D for evens failed.", -1);
+    log_info("test_work_group_suggested_local_size_2D evens passed\n");
+
+    // primes
+    start = max_work_items + 1;
+    end = max_work_items + max_work_items / 4;
+    incr = primes_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_prime, start, end, incr,
+        max_local_mem_size, global_work_offset, _2D);
+    test_error_ret(
+        err, "test_work_group_suggested_local_size_2D for primes failed.", -1);
+    log_info("test_work_group_suggested_local_size_2D primes passed\n");
+
+    global_work_offset[0] = 10;
+    global_work_offset[1] = 10;
+    global_work_offset[2] = 10;
+
+    // odds
+    start = 1;
+    end = max_work_items;
+    incr = basic_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_odd, start, end, incr,
+        max_local_mem_size, global_work_offset, _2D);
+    test_error_ret(err,
+                   "test_work_group_suggested_local_size_2D for odds with "
+                   "global_work_offset failed.",
+                   -1);
+    log_info("test_work_group_suggested_local_size_2D odds with "
+             "global_work_offset passed\n");
+
+    // evens
+    start = 2;
+    end = max_work_items;
+    incr = basic_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_even, start, end, incr,
+        max_local_mem_size, global_work_offset, _2D);
+    test_error_ret(err,
+                   "test_work_group_suggested_local_size_2D for evens with "
+                   "global_work_offset failed.",
+                   -1);
+    log_info("test_work_group_suggested_local_size_2D evens with "
+             "global_work_offset passed\n");
+
+    // primes
+    start = max_work_items + 1;
+    end = max_work_items + max_work_items / 4;
+    incr = primes_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_prime, start, end, incr,
+        max_local_mem_size, global_work_offset, _2D);
+    test_error_ret(err,
+                   "test_work_group_suggested_local_size_2D for primes with "
+                   "global_work_offset failed.",
+                   -1);
+    log_info("test_work_group_suggested_local_size_2D primes with "
+             "global_work_offset passed\n");
+
+    return err;
+}
+
+int test_work_group_suggested_local_size_3D(cl_device_id device,
+                                            cl_context context,
+                                            cl_command_queue queue, int n_elems)
+{
+    if (!is_extension_available(device, "cl_khr_suggested_local_work_size"))
+    {
+        log_info("Device does not support 'cl_khr_suggested_local_work_size'. "
+                 "Skipping the test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
+    cl_long max_local_mem_size;
+    cl_int err =
+        clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE,
+                        sizeof(max_local_mem_size), &max_local_mem_size, NULL);
+    test_error_ret(err, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.",
+                   -1);
+
+    size_t start, end, incr;
+    size_t global_work_offset[] = { 0, 0, 0 };
+    size_t max_work_items = 0;
+    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                    sizeof(max_work_items), &max_work_items, NULL);
+
+    // odds
+    start = 1;
+    end = max_work_items / 2;
+    incr = basic_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_odd, start, end, incr,
+        max_local_mem_size, global_work_offset, _3D);
+    test_error_ret(
+        err, "test_work_group_suggested_local_size_3D for odds failed.", -1);
+    log_info("test_work_group_suggested_local_size_3D odds passed\n");
+
+    // evens
+    start = 2;
+    end = max_work_items / 2;
+    incr = basic_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_even, start, end, incr,
+        max_local_mem_size, global_work_offset, _3D);
+    test_error_ret(
+        err, "test_work_group_suggested_local_size_3D for evens failed.", -1);
+    log_info("test_work_group_suggested_local_size_3D evens passed\n");
+
+    // primes
+    start = max_work_items + 1;
+    end = max_work_items + max_work_items / 4;
+    incr = primes_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_prime, start, end, incr,
+        max_local_mem_size, global_work_offset, _3D);
+    test_error_ret(
+        err, "test_work_group_suggested_local_size_3D for primes failed.", -1);
+    log_info("test_work_group_suggested_local_size_3D primes passed\n");
+
+    global_work_offset[0] = 10;
+    global_work_offset[1] = 10;
+    global_work_offset[2] = 10;
+
+    // odds
+    start = 1;
+    end = max_work_items / 2;
+    incr = basic_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_odd, start, end, incr,
+        max_local_mem_size, global_work_offset, _3D);
+    test_error_ret(err,
+                   "test_work_group_suggested_local_size_3D for odds with "
+                   "global_work_offset failed.",
+                   -1);
+    log_info("test_work_group_suggested_local_size_3D odds with "
+             "global_work_offset passed\n");
+
+    // evens
+    start = 2;
+    end = max_work_items / 2;
+    incr = basic_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_even, start, end, incr,
+        max_local_mem_size, global_work_offset, _3D);
+    test_error_ret(err,
+                   "test_work_group_suggested_local_size_3D for evens with "
+                   "global_work_offset failed.",
+                   -1);
+    log_info("test_work_group_suggested_local_size_3D evens with "
+             "global_work_offset passed\n");
+
+    // primes
+    start = max_work_items + 1;
+    end = max_work_items + max_work_items / 4;
+    incr = primes_increment;
+    err = do_test_work_group_suggested_local_size(
+        device, context, queue, is_not_prime, start, end, incr,
+        max_local_mem_size, global_work_offset, _3D);
+    test_error_ret(err,
+                   "test_work_group_suggested_local_size_3D for primes with "
+                   "global_work_offset failed.",
+                   -1);
+    log_info("test_work_group_suggested_local_size_3D primes with "
+             "global_work_offset passed\n");
+
+    return err;
+}
\ No newline at end of file

From 4759e5cae0e3b3b6dd841fe28ad01f4b4f2478e6 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Wed, 11 Aug 2021 10:03:44 -0700
Subject: [PATCH 115/158] remove testing for scalar vloada_half (#1293)

---
 test_conformance/half/Test_vLoadHalf.cpp | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/test_conformance/half/Test_vLoadHalf.cpp b/test_conformance/half/Test_vLoadHalf.cpp
index 52867c25e7..5dfac7a30e 100644
--- a/test_conformance/half/Test_vLoadHalf.cpp
+++ b/test_conformance/half/Test_vLoadHalf.cpp
@@ -37,14 +37,12 @@ int Test_vLoadHalf_private( cl_device_id device, bool aligned )
     const char *vector_size_names[]   = {"1", "2", "4", "8", "16", "3"};
 
     int minVectorSize = kMinVectorSize;
-    // There is no aligned scalar vloada_half in CL 1.1
-#if ! defined( CL_VERSION_1_1 ) && ! defined(__APPLE__)
-    vlog("Note: testing vloada_half.\n");
-    if (aligned && minVectorSize == 0)
-        minVectorSize = 1;
-#endif
 
-    for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
+    // There is no aligned scalar vloada_half
+    if (aligned && minVectorSize == 0) minVectorSize = 1;
+
+    for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
+         vectorSize++)
     {
 
         int effectiveVectorSize = g_arrVecSizes[vectorSize];
@@ -81,7 +79,7 @@ int Test_vLoadHalf_private( cl_device_id device, bool aligned )
             "{\n"
             "   size_t i = get_global_id(0);\n"
             "   f[i] = vloada_half3( i, p );\n"
-            "   ((__global float *)f)[4*i+3] = vloada_half(4*i+3,p);\n"
+            "   ((__global float *)f)[4*i+3] = vload_half(4*i+3,p);\n"
             "}\n"
         };
 

From 1aa930957a3f7ca6df30b64f61d082f2359fe486 Mon Sep 17 00:00:00 2001
From: Feng Zou <feng.zou@intel.com>
Date: Thu, 12 Aug 2021 01:04:21 +0800
Subject: [PATCH 116/158] Temporarily disable the test_kernel_attributes test
 case (#1297)

* Temporarily disable the test_kernel_attributes test case

Per OpenCL spec on CL_KERNEL_ATTRIBUTES, for kernels not created from OpenCL C
source and the clCreateProgramWithSource API call the string returned from this
query will be empty.
But the test_kernel_attributes test reads from a bc binary and expects to get
kernel attributes, which is not consistent with the OpenCL spec.

* Fix clang format issue
---
 test_conformance/spir/main.cpp | 73 ++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 34 deletions(-)

diff --git a/test_conformance/spir/main.cpp b/test_conformance/spir/main.cpp
index 3a18988c70..06caf33bf7 100644
--- a/test_conformance/spir/main.cpp
+++ b/test_conformance/spir/main.cpp
@@ -6615,40 +6615,45 @@ struct sub_suite
 };
 
 static const sub_suite spir_suites[] = {
-    {"api",                         "api",                       test_api},
-    {"api_double",                  "api",                       test_api_double},
-    {"atomics",                     "atomics",                   test_atomics},
-    {"basic",                       "basic",                     test_basic},
-    {"basic_double",                "basic",                     test_basic_double},
-    {"commonfns",                   "commonfns",                 test_commonfns},
-    {"commonfns_double",            "commonfns",                 test_commonfns_double},
-    {"conversions",                 "conversions",               test_conversions},
-    {"conversions_double",          "conversions",               test_conversions_double},
-    {"geometrics",                  "geometrics",                test_geometrics},
-    {"geometrics_double",           "geometrics",                test_geometrics_double},
-    {"half",                        "half",                      test_half},
-    {"half_double",                 "half",                      test_half_double},
-    {"kernel_image_methods",        "kernel_image_methods",      test_kernel_image_methods},
-    {"images_kernel_read_write",    "images_kernel_read_write",  test_images_kernel_read_write},
-    {"images_samplerlessRead",      "images_samplerlessRead",    test_images_samplerless_read},
-    {"integer_ops",                 "integer_ops",               test_integer_ops},
-    {"math_brute_force",            "math_brute_force",          test_math_brute_force},
-    {"math_brute_force_double",     "math_brute_force",          test_math_brute_force_double},
-    {"printf",                      "printf",                    test_printf},
-    {"profiling",                   "profiling",                 test_profiling},
-    {"relationals",                 "relationals",               test_relationals},
-    {"relationals_double",          "relationals",               test_relationals_double},
-    {"select",                      "select",                    test_select},
-    {"select_double",               "select",                    test_select_double},
-    {"vec_align",                   "vec_align",                 test_vec_align},
-    {"vec_align_double",            "vec_align",                 test_vec_align_double},
-    {"vec_step",                    "vec_step",                  test_vec_step},
-    {"vec_step_double",             "vec_step",                  test_vec_step_double},
-    {"compile_and_link",            "compile_and_link",          test_compile_and_link},
-    {"sampler_enumeration",         "sampler_enumeration",       test_sampler_enumeration},
-    {"enum_values",                 "enum_values",               test_enum_values},
-    {"kernel_attributes",           "kernel_attributes",         test_kernel_attributes},
-    {"binary_type",                  "binary_type",              test_binary_type},
+    { "api", "api", test_api },
+    { "api_double", "api", test_api_double },
+    { "atomics", "atomics", test_atomics },
+    { "basic", "basic", test_basic },
+    { "basic_double", "basic", test_basic_double },
+    { "commonfns", "commonfns", test_commonfns },
+    { "commonfns_double", "commonfns", test_commonfns_double },
+    { "conversions", "conversions", test_conversions },
+    { "conversions_double", "conversions", test_conversions_double },
+    { "geometrics", "geometrics", test_geometrics },
+    { "geometrics_double", "geometrics", test_geometrics_double },
+    { "half", "half", test_half },
+    { "half_double", "half", test_half_double },
+    { "kernel_image_methods", "kernel_image_methods",
+      test_kernel_image_methods },
+    { "images_kernel_read_write", "images_kernel_read_write",
+      test_images_kernel_read_write },
+    { "images_samplerlessRead", "images_samplerlessRead",
+      test_images_samplerless_read },
+    { "integer_ops", "integer_ops", test_integer_ops },
+    { "math_brute_force", "math_brute_force", test_math_brute_force },
+    { "math_brute_force_double", "math_brute_force",
+      test_math_brute_force_double },
+    { "printf", "printf", test_printf },
+    { "profiling", "profiling", test_profiling },
+    { "relationals", "relationals", test_relationals },
+    { "relationals_double", "relationals", test_relationals_double },
+    { "select", "select", test_select },
+    { "select_double", "select", test_select_double },
+    { "vec_align", "vec_align", test_vec_align },
+    { "vec_align_double", "vec_align", test_vec_align_double },
+    { "vec_step", "vec_step", test_vec_step },
+    { "vec_step_double", "vec_step", test_vec_step_double },
+    { "compile_and_link", "compile_and_link", test_compile_and_link },
+    { "sampler_enumeration", "sampler_enumeration", test_sampler_enumeration },
+    { "enum_values", "enum_values", test_enum_values },
+    // {"kernel_attributes",           "kernel_attributes",
+    // test_kernel_attributes}, // disabling temporarily, see GitHub #1284
+    { "binary_type", "binary_type", test_binary_type },
 };
 
 

From 6da9c6b68f9643a077f7281451b59f444a77a991 Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Wed, 11 Aug 2021 19:06:10 +0200
Subject: [PATCH 117/158] Fix double free in c11_atomics tests for SVM
 allocations (#1286)

* Only Clang format changes

* Fix double free object for SVM allocations

* Fix double free - review fixes
---
 test_conformance/c11_atomics/common.h | 2556 +++++++++++++------------
 1 file changed, 1381 insertions(+), 1175 deletions(-)

diff --git a/test_conformance/c11_atomics/common.h b/test_conformance/c11_atomics/common.h
index bbcc68c657..d30259f0f1 100644
--- a/test_conformance/c11_atomics/common.h
+++ b/test_conformance/c11_atomics/common.h
@@ -28,10 +28,9 @@
 #define MAX_DEVICE_THREADS (gHost ? 0U : gMaxDeviceThreads)
 #define MAX_HOST_THREADS GetThreadCount()
 
-#define EXECUTE_TEST(error, test)\
-  error |= test;\
-  if(error && !gContinueOnError)\
-  return error;
+#define EXECUTE_TEST(error, test)                                              \
+    error |= test;                                                             \
+    if (error && !gContinueOnError) return error;
 
 enum TExplicitAtomicType
 {
@@ -57,764 +56,918 @@ enum TExplicitMemoryScopeType
     MEMORY_SCOPE_ALL_SVM_DEVICES
 };
 
-extern bool gHost; // temporary flag for testing native host threads (test verification)
+extern bool
+    gHost; // temporary flag for testing native host threads (test verification)
 extern bool gOldAPI; // temporary flag for testing with old API (OpenCL 1.2)
 extern bool gContinueOnError; // execute all cases even when errors detected
-extern bool gNoGlobalVariables; // disable cases with global atomics in program scope
+extern bool
+    gNoGlobalVariables; // disable cases with global atomics in program scope
 extern bool gNoGenericAddressSpace; // disable cases with generic address space
 extern bool gUseHostPtr; // use malloc/free instead of clSVMAlloc/clSVMFree
 extern bool gDebug; // print OpenCL kernel code
-extern int gInternalIterations; // internal test iterations for atomic operation, sufficient to verify atomicity
-extern int gMaxDeviceThreads; // maximum number of threads executed on OCL device
+extern int gInternalIterations; // internal test iterations for atomic
+                                // operation, sufficient to verify atomicity
+extern int
+    gMaxDeviceThreads; // maximum number of threads executed on OCL device
 extern cl_device_atomic_capabilities gAtomicMemCap,
     gAtomicFenceCap; // atomic memory and fence capabilities for this device
 
-extern const char *get_memory_order_type_name(TExplicitMemoryOrderType orderType);
-extern const char *get_memory_scope_type_name(TExplicitMemoryScopeType scopeType);
+extern const char *
+get_memory_order_type_name(TExplicitMemoryOrderType orderType);
+extern const char *
+get_memory_scope_type_name(TExplicitMemoryScopeType scopeType);
 
 extern cl_int getSupportedMemoryOrdersAndScopes(
     cl_device_id device, std::vector<TExplicitMemoryOrderType> &memoryOrders,
     std::vector<TExplicitMemoryScopeType> &memoryScopes);
 
-class AtomicTypeInfo
-{
+class AtomicTypeInfo {
 public:
-  TExplicitAtomicType _type;
-  AtomicTypeInfo(TExplicitAtomicType type): _type(type) {}
-  cl_uint Size(cl_device_id device);
-  const char* AtomicTypeName();
-  const char* RegularTypeName();
-  const char* AddSubOperandTypeName();
-  int IsSupported(cl_device_id device);
+    TExplicitAtomicType _type;
+    AtomicTypeInfo(TExplicitAtomicType type): _type(type) {}
+    cl_uint Size(cl_device_id device);
+    const char *AtomicTypeName();
+    const char *RegularTypeName();
+    const char *AddSubOperandTypeName();
+    int IsSupported(cl_device_id device);
 };
 
-template<typename HostDataType>
-class AtomicTypeExtendedInfo : public AtomicTypeInfo
-{
+template <typename HostDataType>
+class AtomicTypeExtendedInfo : public AtomicTypeInfo {
 public:
-  AtomicTypeExtendedInfo(TExplicitAtomicType type) : AtomicTypeInfo(type) {}
-  HostDataType MinValue();
-  HostDataType MaxValue();
-  HostDataType SpecialValue(cl_uchar x)
-  {
-    HostDataType tmp;
-    cl_uchar *ptr = (cl_uchar*)&tmp;
-    for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_uchar); i++)
-      ptr[i] = x;
-    return tmp;
-  }
-  HostDataType SpecialValue(cl_ushort x)
-  {
-    HostDataType tmp;
-    cl_ushort *ptr = (cl_ushort*)&tmp;
-    for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_ushort); i++)
-      ptr[i] = x;
-    return tmp;
-  }
+    AtomicTypeExtendedInfo(TExplicitAtomicType type): AtomicTypeInfo(type) {}
+    HostDataType MinValue();
+    HostDataType MaxValue();
+    HostDataType SpecialValue(cl_uchar x)
+    {
+        HostDataType tmp;
+        cl_uchar *ptr = (cl_uchar *)&tmp;
+        for (cl_uint i = 0; i < sizeof(HostDataType) / sizeof(cl_uchar); i++)
+            ptr[i] = x;
+        return tmp;
+    }
+    HostDataType SpecialValue(cl_ushort x)
+    {
+        HostDataType tmp;
+        cl_ushort *ptr = (cl_ushort *)&tmp;
+        for (cl_uint i = 0; i < sizeof(HostDataType) / sizeof(cl_ushort); i++)
+            ptr[i] = x;
+        return tmp;
+    }
 };
 
-class CTest  {
+class CTest {
 public:
-  virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) = 0;
+    virtual int Execute(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements) = 0;
 };
 
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTest : CTest
-{
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTest : CTest {
 public:
-  typedef struct {
-    CBasicTest *test;
-    cl_uint tid;
-    cl_uint threadCount;
-    volatile HostAtomicType *destMemory;
-    HostDataType *oldValues;
-  } THostThreadContext;
-  static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id, void *userInfo)
-  {
-    THostThreadContext *threadContext = ((THostThreadContext*)userInfo)+job_id;
-    threadContext->test->HostFunction(threadContext->tid, threadContext->threadCount, threadContext->destMemory, threadContext->oldValues);
-    return 0;
-  }
-  CBasicTest(TExplicitAtomicType dataType, bool useSVM) : CTest(),
-    _maxDeviceThreads(MAX_DEVICE_THREADS),
-    _dataType(dataType), _useSVM(useSVM), _startValue(255),
-    _localMemory(false), _declaredInProgram(false),
-    _usedInFunction(false), _genericAddrSpace(false),
-    _oldValueCheck(true), _localRefValues(false),
-    _maxGroupSize(0), _passCount(0), _iterations(gInternalIterations)
-  {
-  }
-  virtual ~CBasicTest()
-  {
-    if(_passCount)
-      log_info("  %u tests executed successfully for %s\n", _passCount, DataType().AtomicTypeName());
-  }
-  virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
-  {
-    return 1;
-  }
-  virtual cl_uint NumNonAtomicVariablesPerThread()
-  {
-    return 1;
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    return false;
-  }
-  virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d)
-  {
-    return false;
-  }
-  virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
-  {
-    return false;
-  }
-  virtual std::string PragmaHeader(cl_device_id deviceID);
-  virtual std::string ProgramHeader(cl_uint maxNumDestItems);
-  virtual std::string FunctionCode();
-  virtual std::string KernelCode(cl_uint maxNumDestItems);
-  virtual std::string ProgramCore() = 0;
-  virtual std::string SingleTestName()
-  {
-    std::string testName = LocalMemory() ? "local" : "global";
-    testName += " ";
-    testName += DataType().AtomicTypeName();
-    if(DeclaredInProgram())
-    {
-      testName += " declared in program";
-    }
-    if(DeclaredInProgram() && UsedInFunction())
-      testName += ",";
-    if(UsedInFunction())
-    {
-      testName += " used in ";
-      if(GenericAddrSpace())
-        testName += "generic ";
-      testName += "function";
-    }
-    return testName;
-  }
-  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue);
-  int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    int error = 0;
-    UsedInFunction(false);
-    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
-    UsedInFunction(true);
-    GenericAddrSpace(false);
-    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
-    GenericAddrSpace(true);
-    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
-    GenericAddrSpace(false);
-    return error;
-  }
-  int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    int error = 0;
-    DeclaredInProgram(false);
-    EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
-    if(!UseSVM())
-    {
-      DeclaredInProgram(true);
-      EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
-    }
-    return error;
-  }
-  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    int error = 0;
-    if(_maxDeviceThreads > 0 && !UseSVM())
-    {
-      LocalMemory(true);
-      EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
-    }
-    if(_maxDeviceThreads+MaxHostThreads() > 0)
-    {
-      LocalMemory(false);
-      EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
-    }
-    return error;
-  }
-  virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  {
-    if(sizeof(HostAtomicType) != DataType().Size(deviceID))
-    {
-      log_info("Invalid test: Host atomic type size (%u) is different than OpenCL type size (%u)\n", (cl_uint)sizeof(HostAtomicType), DataType().Size(deviceID));
-      return -1;
-    }
-    if(sizeof(HostAtomicType) != sizeof(HostDataType))
-    {
-      log_info("Invalid test: Host atomic type size (%u) is different than corresponding type size (%u)\n", (cl_uint)sizeof(HostAtomicType), (cl_uint)sizeof(HostDataType));
-      return -1;
-    }
-    // Verify we can run first
-    if(UseSVM() && !gUseHostPtr)
-    {
-      cl_device_svm_capabilities caps;
-      cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, sizeof(caps), &caps, 0);
-      test_error(error, "clGetDeviceInfo failed");
-      if((caps & CL_DEVICE_SVM_ATOMICS) == 0)
-      {
-        log_info("\t%s - SVM_ATOMICS not supported\n", DataType().AtomicTypeName());
-        // implicit pass
+    typedef struct
+    {
+        CBasicTest *test;
+        cl_uint tid;
+        cl_uint threadCount;
+        volatile HostAtomicType *destMemory;
+        HostDataType *oldValues;
+    } THostThreadContext;
+    static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id,
+                                     void *userInfo)
+    {
+        THostThreadContext *threadContext =
+            ((THostThreadContext *)userInfo) + job_id;
+        threadContext->test->HostFunction(
+            threadContext->tid, threadContext->threadCount,
+            threadContext->destMemory, threadContext->oldValues);
         return 0;
-      }
     }
-    if(!DataType().IsSupported(deviceID))
+    CBasicTest(TExplicitAtomicType dataType, bool useSVM)
+        : CTest(), _maxDeviceThreads(MAX_DEVICE_THREADS), _dataType(dataType),
+          _useSVM(useSVM), _startValue(255), _localMemory(false),
+          _declaredInProgram(false), _usedInFunction(false),
+          _genericAddrSpace(false), _oldValueCheck(true),
+          _localRefValues(false), _maxGroupSize(0), _passCount(0),
+          _iterations(gInternalIterations)
+    {}
+    virtual ~CBasicTest()
+    {
+        if (_passCount)
+            log_info("  %u tests executed successfully for %s\n", _passCount,
+                     DataType().AtomicTypeName());
+    }
+    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
+    {
+        return 1;
+    }
+    virtual cl_uint NumNonAtomicVariablesPerThread() { return 1; }
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
+    {
+        return false;
+    }
+    virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
+                              MTdata d)
+    {
+        return false;
+    }
+    virtual bool VerifyRefs(bool &correct, cl_uint threadCount,
+                            HostDataType *refValues,
+                            HostAtomicType *finalValues)
     {
-      log_info("\t%s not supported\n", DataType().AtomicTypeName());
-      // implicit pass or host test (debug feature)
-      if(UseSVM())
+        return false;
+    }
+    virtual std::string PragmaHeader(cl_device_id deviceID);
+    virtual std::string ProgramHeader(cl_uint maxNumDestItems);
+    virtual std::string FunctionCode();
+    virtual std::string KernelCode(cl_uint maxNumDestItems);
+    virtual std::string ProgramCore() = 0;
+    virtual std::string SingleTestName()
+    {
+        std::string testName = LocalMemory() ? "local" : "global";
+        testName += " ";
+        testName += DataType().AtomicTypeName();
+        if (DeclaredInProgram())
+        {
+            testName += " declared in program";
+        }
+        if (DeclaredInProgram() && UsedInFunction()) testName += ",";
+        if (UsedInFunction())
+        {
+            testName += " used in ";
+            if (GenericAddrSpace()) testName += "generic ";
+            testName += "function";
+        }
+        return testName;
+    }
+    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue);
+    int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue)
+    {
+        int error = 0;
+        UsedInFunction(false);
+        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
+        UsedInFunction(true);
+        GenericAddrSpace(false);
+        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
+        GenericAddrSpace(true);
+        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
+        GenericAddrSpace(false);
+        return error;
+    }
+    int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue)
+    {
+        int error = 0;
+        DeclaredInProgram(false);
+        EXECUTE_TEST(error,
+                     ExecuteForEachPointerType(deviceID, context, queue));
+        if (!UseSVM())
+        {
+            DeclaredInProgram(true);
+            EXECUTE_TEST(error,
+                         ExecuteForEachPointerType(deviceID, context, queue));
+        }
+        return error;
+    }
+    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
+                                           cl_context context,
+                                           cl_command_queue queue)
+    {
+        int error = 0;
+        if (_maxDeviceThreads > 0 && !UseSVM())
+        {
+            LocalMemory(true);
+            EXECUTE_TEST(
+                error, ExecuteForEachDeclarationType(deviceID, context, queue));
+        }
+        if (_maxDeviceThreads + MaxHostThreads() > 0)
+        {
+            LocalMemory(false);
+            EXECUTE_TEST(
+                error, ExecuteForEachDeclarationType(deviceID, context, queue));
+        }
+        return error;
+    }
+    virtual int Execute(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements)
+    {
+        if (sizeof(HostAtomicType) != DataType().Size(deviceID))
+        {
+            log_info("Invalid test: Host atomic type size (%u) is different "
+                     "than OpenCL type size (%u)\n",
+                     (cl_uint)sizeof(HostAtomicType),
+                     DataType().Size(deviceID));
+            return -1;
+        }
+        if (sizeof(HostAtomicType) != sizeof(HostDataType))
+        {
+            log_info("Invalid test: Host atomic type size (%u) is different "
+                     "than corresponding type size (%u)\n",
+                     (cl_uint)sizeof(HostAtomicType),
+                     (cl_uint)sizeof(HostDataType));
+            return -1;
+        }
+        // Verify we can run first
+        if (UseSVM() && !gUseHostPtr)
+        {
+            cl_device_svm_capabilities caps;
+            cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES,
+                                           sizeof(caps), &caps, 0);
+            test_error(error, "clGetDeviceInfo failed");
+            if ((caps & CL_DEVICE_SVM_ATOMICS) == 0)
+            {
+                log_info("\t%s - SVM_ATOMICS not supported\n",
+                         DataType().AtomicTypeName());
+                // implicit pass
+                return 0;
+            }
+        }
+        if (!DataType().IsSupported(deviceID))
+        {
+            log_info("\t%s not supported\n", DataType().AtomicTypeName());
+            // implicit pass or host test (debug feature)
+            if (UseSVM()) return 0;
+            _maxDeviceThreads = 0;
+        }
+        if (_maxDeviceThreads + MaxHostThreads() == 0) return 0;
+        return ExecuteForEachParameterSet(deviceID, context, queue);
+    }
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        log_info("Empty thread function %u\n", (cl_uint)tid);
+    }
+    AtomicTypeExtendedInfo<HostDataType> DataType() const
+    {
+        return AtomicTypeExtendedInfo<HostDataType>(_dataType);
+    }
+    cl_uint _maxDeviceThreads;
+    virtual cl_uint MaxHostThreads()
+    {
+        if (UseSVM() || gHost)
+            return MAX_HOST_THREADS;
+        else
+            return 0;
+    }
+
+    int CheckCapabilities(TExplicitMemoryScopeType memoryScope,
+                          TExplicitMemoryOrderType memoryOrder)
+    {
+        /*
+            Differentiation between atomic fence and other atomic operations
+            does not need to occur here.
+
+            The initialisation of this test checks that the minimum required
+            capabilities are supported by this device.
+
+            The following switches allow the test to skip if optional
+           capabilites are not supported by the device.
+          */
+        switch (memoryScope)
+        {
+            case MEMORY_SCOPE_EMPTY: {
+                break;
+            }
+            case MEMORY_SCOPE_WORK_GROUP: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            case MEMORY_SCOPE_DEVICE: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            case MEMORY_SCOPE_ALL_DEVICES: // fallthough
+            case MEMORY_SCOPE_ALL_SVM_DEVICES: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            default: {
+                log_info("Invalid memory scope\n");
+                break;
+            }
+        }
+
+        switch (memoryOrder)
+        {
+            case MEMORY_ORDER_EMPTY: {
+                break;
+            }
+            case MEMORY_ORDER_RELAXED: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            case MEMORY_ORDER_ACQUIRE:
+            case MEMORY_ORDER_RELEASE:
+            case MEMORY_ORDER_ACQ_REL: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            case MEMORY_ORDER_SEQ_CST: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            default: {
+                log_info("Invalid memory order\n");
+                break;
+            }
+        }
+
         return 0;
-      _maxDeviceThreads = 0;
-    }
-    if(_maxDeviceThreads+MaxHostThreads() == 0)
-      return 0;
-    return ExecuteForEachParameterSet(deviceID, context, queue);
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    log_info("Empty thread function %u\n", (cl_uint)tid);
-  }
-  AtomicTypeExtendedInfo<HostDataType> DataType() const
-  {
-    return AtomicTypeExtendedInfo<HostDataType>(_dataType);
-  }
-  cl_uint _maxDeviceThreads;
-  virtual cl_uint MaxHostThreads()
-  {
-    if(UseSVM() || gHost)
-      return MAX_HOST_THREADS;
-    else
-      return 0;
-  }
-
-  int CheckCapabilities(TExplicitMemoryScopeType memoryScope,
-                        TExplicitMemoryOrderType memoryOrder)
-  {
-      /*
-          Differentiation between atomic fence and other atomic operations
-          does not need to occur here.
-
-          The initialisation of this test checks that the minimum required
-          capabilities are supported by this device.
-
-          The following switches allow the test to skip if optional capabilites
-          are not supported by the device.
-        */
-      switch (memoryScope)
-      {
-          case MEMORY_SCOPE_EMPTY: {
-              break;
-          }
-          case MEMORY_SCOPE_WORK_GROUP: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          case MEMORY_SCOPE_DEVICE: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          case MEMORY_SCOPE_ALL_DEVICES: // fallthough
-          case MEMORY_SCOPE_ALL_SVM_DEVICES: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          default: {
-              log_info("Invalid memory scope\n");
-              break;
-          }
-      }
-
-      switch (memoryOrder)
-      {
-          case MEMORY_ORDER_EMPTY: {
-              break;
-          }
-          case MEMORY_ORDER_RELAXED: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          case MEMORY_ORDER_ACQUIRE:
-          case MEMORY_ORDER_RELEASE:
-          case MEMORY_ORDER_ACQ_REL: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          case MEMORY_ORDER_SEQ_CST: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          default: {
-              log_info("Invalid memory order\n");
-              break;
-          }
-      }
-
-      return 0;
-  }
-  virtual bool SVMDataBufferAllSVMConsistent() {return false;}
-  bool UseSVM() {return _useSVM;}
-  void StartValue(HostDataType startValue) {_startValue = startValue;}
-  HostDataType StartValue() {return _startValue;}
-  void LocalMemory(bool local) {_localMemory = local;}
-  bool LocalMemory() {return _localMemory;}
-  void DeclaredInProgram(bool declaredInProgram) {_declaredInProgram = declaredInProgram;}
-  bool DeclaredInProgram() {return _declaredInProgram;}
-  void UsedInFunction(bool local) {_usedInFunction = local;}
-  bool UsedInFunction() {return _usedInFunction;}
-  void GenericAddrSpace(bool genericAddrSpace) {_genericAddrSpace = genericAddrSpace;}
-  bool GenericAddrSpace() {return _genericAddrSpace;}
-  void OldValueCheck(bool check) {_oldValueCheck = check;}
-  bool OldValueCheck() {return _oldValueCheck;}
-  void LocalRefValues(bool localRefValues) {_localRefValues = localRefValues;}
-  bool LocalRefValues() {return _localRefValues;}
-  void MaxGroupSize(cl_uint maxGroupSize) {_maxGroupSize = maxGroupSize;}
-  cl_uint MaxGroupSize() {return _maxGroupSize;}
-  void CurrentGroupSize(cl_uint currentGroupSize)
-  {
-    if(MaxGroupSize() && MaxGroupSize() < currentGroupSize)
-      _currentGroupSize = MaxGroupSize();
-    else
-      _currentGroupSize = currentGroupSize;
-  }
-  cl_uint CurrentGroupSize() {return _currentGroupSize;}
-  virtual cl_uint CurrentGroupNum(cl_uint threadCount)
-  {
-    if(threadCount == 0)
-      return 0;
-    if(LocalMemory())
-      return 1;
-    return threadCount/CurrentGroupSize();
-  }
-  cl_int Iterations() {return _iterations;}
-  std::string IterationsStr() {std::stringstream ss; ss << _iterations; return ss.str();}
+    }
+    virtual bool SVMDataBufferAllSVMConsistent() { return false; }
+    bool UseSVM() { return _useSVM; }
+    void StartValue(HostDataType startValue) { _startValue = startValue; }
+    HostDataType StartValue() { return _startValue; }
+    void LocalMemory(bool local) { _localMemory = local; }
+    bool LocalMemory() { return _localMemory; }
+    void DeclaredInProgram(bool declaredInProgram)
+    {
+        _declaredInProgram = declaredInProgram;
+    }
+    bool DeclaredInProgram() { return _declaredInProgram; }
+    void UsedInFunction(bool local) { _usedInFunction = local; }
+    bool UsedInFunction() { return _usedInFunction; }
+    void GenericAddrSpace(bool genericAddrSpace)
+    {
+        _genericAddrSpace = genericAddrSpace;
+    }
+    bool GenericAddrSpace() { return _genericAddrSpace; }
+    void OldValueCheck(bool check) { _oldValueCheck = check; }
+    bool OldValueCheck() { return _oldValueCheck; }
+    void LocalRefValues(bool localRefValues)
+    {
+        _localRefValues = localRefValues;
+    }
+    bool LocalRefValues() { return _localRefValues; }
+    void MaxGroupSize(cl_uint maxGroupSize) { _maxGroupSize = maxGroupSize; }
+    cl_uint MaxGroupSize() { return _maxGroupSize; }
+    void CurrentGroupSize(cl_uint currentGroupSize)
+    {
+        if (MaxGroupSize() && MaxGroupSize() < currentGroupSize)
+            _currentGroupSize = MaxGroupSize();
+        else
+            _currentGroupSize = currentGroupSize;
+    }
+    cl_uint CurrentGroupSize() { return _currentGroupSize; }
+    virtual cl_uint CurrentGroupNum(cl_uint threadCount)
+    {
+        if (threadCount == 0) return 0;
+        if (LocalMemory()) return 1;
+        return threadCount / CurrentGroupSize();
+    }
+    cl_int Iterations() { return _iterations; }
+    std::string IterationsStr()
+    {
+        std::stringstream ss;
+        ss << _iterations;
+        return ss.str();
+    }
+
 private:
-  const TExplicitAtomicType _dataType;
-  const bool _useSVM;
-  HostDataType	_startValue;
-  bool _localMemory;
-  bool _declaredInProgram;
-  bool _usedInFunction;
-  bool _genericAddrSpace;
-  bool _oldValueCheck;
-  bool _localRefValues;
-  cl_uint _maxGroupSize;
-  cl_uint _currentGroupSize;
-  cl_uint _passCount;
-  const cl_int _iterations;
+    const TExplicitAtomicType _dataType;
+    const bool _useSVM;
+    HostDataType _startValue;
+    bool _localMemory;
+    bool _declaredInProgram;
+    bool _usedInFunction;
+    bool _genericAddrSpace;
+    bool _oldValueCheck;
+    bool _localRefValues;
+    cl_uint _maxGroupSize;
+    cl_uint _currentGroupSize;
+    cl_uint _passCount;
+    const cl_int _iterations;
 };
 
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestMemOrderScope : public CBasicTest<HostAtomicType, HostDataType>
-{
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestMemOrderScope
+    : public CBasicTest<HostAtomicType, HostDataType> {
 public:
-  using CBasicTest<HostAtomicType, HostDataType>::LocalMemory;
-  using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize;
-  using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
-  CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-  }
-  virtual std::string ProgramHeader(cl_uint maxNumDestItems)
-  {
-    std::string header;
-    if(gOldAPI)
-    {
-      std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s";
-      header +=
-        "#define atomic_store_explicit(x,y,o"+s+")                     atomic_store(x,y)\n"
-        "#define atomic_load_explicit(x,o"+s+")                        atomic_load(x)\n"
-        "#define atomic_exchange_explicit(x,y,o"+s+")                  atomic_exchange(x,y)\n"
-        "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_strong(x,y,z)\n"
-        "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"+s+")   atomic_compare_exchange_weak(x,y,z)\n"
-        "#define atomic_fetch_add_explicit(x,y,o"+s+")                 atomic_fetch_add(x,y)\n"
-        "#define atomic_fetch_sub_explicit(x,y,o"+s+")                 atomic_fetch_sub(x,y)\n"
-        "#define atomic_fetch_or_explicit(x,y,o"+s+")                  atomic_fetch_or(x,y)\n"
-        "#define atomic_fetch_xor_explicit(x,y,o"+s+")                 atomic_fetch_xor(x,y)\n"
-        "#define atomic_fetch_and_explicit(x,y,o"+s+")                 atomic_fetch_and(x,y)\n"
-        "#define atomic_fetch_min_explicit(x,y,o"+s+")                 atomic_fetch_min(x,y)\n"
-        "#define atomic_fetch_max_explicit(x,y,o"+s+")                 atomic_fetch_max(x,y)\n"
-        "#define atomic_flag_test_and_set_explicit(x,o"+s+")           atomic_flag_test_and_set(x)\n"
-        "#define atomic_flag_clear_explicit(x,o"+s+")                  atomic_flag_clear(x)\n";
-    }
-    return header+CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(maxNumDestItems);
-  }
-  virtual std::string SingleTestName()
-  {
-    std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
-    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
-    {
-      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
-    }
-    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
-    {
-      testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
-    }
-    return testName;
-  }
-  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    if(LocalMemory() &&
-      MemoryScope() != MEMORY_SCOPE_EMPTY &&
-      MemoryScope() != MEMORY_SCOPE_WORK_GROUP) //memory scope should only be used for global memory
-      return 0;
-    if(MemoryScope() == MEMORY_SCOPE_DEVICE)
-      MaxGroupSize(16); // increase number of groups by forcing smaller group size
-    else
-      MaxGroupSize(0); // group size limited by device capabilities
-
-    if (CheckCapabilities(MemoryScope(), MemoryOrder()) == TEST_SKIPPED_ITSELF)
-        return 0; // skip test - not applicable
-
-    return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue);
-  }
-  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    // repeat test for each reasonable memory order/scope combination
-    std::vector<TExplicitMemoryOrderType> memoryOrder;
-    std::vector<TExplicitMemoryScopeType> memoryScope;
-    int error = 0;
-
-    // For OpenCL-3.0 and later some orderings and scopes are optional, so here
-    // we query for the supported ones.
-    test_error_ret(
-        getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope),
-        "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);
-
-    for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
-    {
-      for(unsigned si = 0; si < memoryScope.size(); si++)
-      {
-        if(memoryOrder[oi] == MEMORY_ORDER_EMPTY && memoryScope[si] != MEMORY_SCOPE_EMPTY)
-          continue;
-        MemoryOrder(memoryOrder[oi]);
-        MemoryScope(memoryScope[si]);
-        EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
-      }
-    }
-    return error;
-  }
-  void MemoryOrder(TExplicitMemoryOrderType memoryOrder) {_memoryOrder = memoryOrder;}
-  TExplicitMemoryOrderType MemoryOrder() {return _memoryOrder;}
-  std::string MemoryOrderStr()
-  {
-    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
-      return std::string(", ")+get_memory_order_type_name(MemoryOrder());
-    return "";
-  }
-  void MemoryScope(TExplicitMemoryScopeType memoryScope) {_memoryScope = memoryScope;}
-  TExplicitMemoryScopeType MemoryScope() {return _memoryScope;}
-  std::string MemoryScopeStr()
-  {
-    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
-      return std::string(", ")+get_memory_scope_type_name(MemoryScope());
-    return "";
-  }
-  std::string MemoryOrderScopeStr()
-  {
-    return MemoryOrderStr()+MemoryScopeStr();
-  }
-  virtual cl_uint CurrentGroupNum(cl_uint threadCount)
-  {
-    if(MemoryScope() == MEMORY_SCOPE_WORK_GROUP)
-      return 1;
-    return CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum(threadCount);
-  }
-  virtual cl_uint MaxHostThreads()
-  {
-      // block host threads execution for memory scope different than
-      // memory_scope_all_svm_devices
-      if (MemoryScope() == MEMORY_SCOPE_ALL_DEVICES
-          || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost)
-      {
-          return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads();
-      }
-      else
-      {
-          return 0;
-      }
-  }
+    using CBasicTest<HostAtomicType, HostDataType>::LocalMemory;
+    using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize;
+    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
+    CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false)
+        : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
+    {}
+    virtual std::string ProgramHeader(cl_uint maxNumDestItems)
+    {
+        std::string header;
+        if (gOldAPI)
+        {
+            std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s";
+            header += "#define atomic_store_explicit(x,y,o" + s
+                + ")                     atomic_store(x,y)\n"
+                  "#define atomic_load_explicit(x,o"
+                + s
+                + ")                        atomic_load(x)\n"
+                  "#define atomic_exchange_explicit(x,y,o"
+                + s
+                + ")                  atomic_exchange(x,y)\n"
+                  "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"
+                + s
+                + ") atomic_compare_exchange_strong(x,y,z)\n"
+                  "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"
+                + s
+                + ")   atomic_compare_exchange_weak(x,y,z)\n"
+                  "#define atomic_fetch_add_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_add(x,y)\n"
+                  "#define atomic_fetch_sub_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_sub(x,y)\n"
+                  "#define atomic_fetch_or_explicit(x,y,o"
+                + s
+                + ")                  atomic_fetch_or(x,y)\n"
+                  "#define atomic_fetch_xor_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_xor(x,y)\n"
+                  "#define atomic_fetch_and_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_and(x,y)\n"
+                  "#define atomic_fetch_min_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_min(x,y)\n"
+                  "#define atomic_fetch_max_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_max(x,y)\n"
+                  "#define atomic_flag_test_and_set_explicit(x,o"
+                + s
+                + ")           atomic_flag_test_and_set(x)\n"
+                  "#define atomic_flag_clear_explicit(x,o"
+                + s + ")                  atomic_flag_clear(x)\n";
+        }
+        return header
+            + CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(
+                   maxNumDestItems);
+    }
+    virtual std::string SingleTestName()
+    {
+        std::string testName =
+            CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
+        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
+        {
+            testName += std::string(", ")
+                + std::string(get_memory_order_type_name(MemoryOrder()))
+                      .substr(sizeof("memory"));
+        }
+        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
+        {
+            testName += std::string(", ")
+                + std::string(get_memory_scope_type_name(MemoryScope()))
+                      .substr(sizeof("memory"));
+        }
+        return testName;
+    }
+    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue)
+    {
+        if (LocalMemory() && MemoryScope() != MEMORY_SCOPE_EMPTY
+            && MemoryScope()
+                != MEMORY_SCOPE_WORK_GROUP) // memory scope should only be used
+                                            // for global memory
+            return 0;
+        if (MemoryScope() == MEMORY_SCOPE_DEVICE)
+            MaxGroupSize(
+                16); // increase number of groups by forcing smaller group size
+        else
+            MaxGroupSize(0); // group size limited by device capabilities
+
+        if (CheckCapabilities(MemoryScope(), MemoryOrder())
+            == TEST_SKIPPED_ITSELF)
+            return 0; // skip test - not applicable
+
+        return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
+            deviceID, context, queue);
+    }
+    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
+                                           cl_context context,
+                                           cl_command_queue queue)
+    {
+        // repeat test for each reasonable memory order/scope combination
+        std::vector<TExplicitMemoryOrderType> memoryOrder;
+        std::vector<TExplicitMemoryScopeType> memoryScope;
+        int error = 0;
+
+        // For OpenCL-3.0 and later some orderings and scopes are optional, so
+        // here we query for the supported ones.
+        test_error_ret(getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder,
+                                                         memoryScope),
+                       "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);
+
+        for (unsigned oi = 0; oi < memoryOrder.size(); oi++)
+        {
+            for (unsigned si = 0; si < memoryScope.size(); si++)
+            {
+                if (memoryOrder[oi] == MEMORY_ORDER_EMPTY
+                    && memoryScope[si] != MEMORY_SCOPE_EMPTY)
+                    continue;
+                MemoryOrder(memoryOrder[oi]);
+                MemoryScope(memoryScope[si]);
+                EXECUTE_TEST(
+                    error,
+                    (CBasicTest<HostAtomicType, HostDataType>::
+                         ExecuteForEachParameterSet(deviceID, context, queue)));
+            }
+        }
+        return error;
+    }
+    void MemoryOrder(TExplicitMemoryOrderType memoryOrder)
+    {
+        _memoryOrder = memoryOrder;
+    }
+    TExplicitMemoryOrderType MemoryOrder() { return _memoryOrder; }
+    std::string MemoryOrderStr()
+    {
+        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
+            return std::string(", ")
+                + get_memory_order_type_name(MemoryOrder());
+        return "";
+    }
+    void MemoryScope(TExplicitMemoryScopeType memoryScope)
+    {
+        _memoryScope = memoryScope;
+    }
+    TExplicitMemoryScopeType MemoryScope() { return _memoryScope; }
+    std::string MemoryScopeStr()
+    {
+        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
+            return std::string(", ")
+                + get_memory_scope_type_name(MemoryScope());
+        return "";
+    }
+    std::string MemoryOrderScopeStr()
+    {
+        return MemoryOrderStr() + MemoryScopeStr();
+    }
+    virtual cl_uint CurrentGroupNum(cl_uint threadCount)
+    {
+        if (MemoryScope() == MEMORY_SCOPE_WORK_GROUP) return 1;
+        return CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum(
+            threadCount);
+    }
+    virtual cl_uint MaxHostThreads()
+    {
+        // allow host threads only for memory_scope_all_devices and
+        // memory_scope_all_svm_devices, or when gHost is set; otherwise none
+        if (MemoryScope() == MEMORY_SCOPE_ALL_DEVICES
+            || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost)
+        {
+            return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads();
+        }
+        else
+        {
+            return 0;
+        }
+    }
+
 private:
-  TExplicitMemoryOrderType _memoryOrder;
-  TExplicitMemoryScopeType _memoryScope;
+    TExplicitMemoryOrderType _memoryOrder;
+    TExplicitMemoryScopeType _memoryScope;
 };
 
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestMemOrder2Scope : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-{
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestMemOrder2Scope
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
 public:
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
-  using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
-
-  CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-  }
-  virtual std::string SingleTestName()
-  {
-    std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
-    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
-      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
-    if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
-      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder2())).substr(sizeof("memory"));
-    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
-      testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
-    return testName;
-  }
-  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    // repeat test for each reasonable memory order/scope combination
-    std::vector<TExplicitMemoryOrderType> memoryOrder;
-    std::vector<TExplicitMemoryScopeType> memoryScope;
-    int error = 0;
-
-    // For OpenCL-3.0 and later some orderings and scopes are optional, so here
-    // we query for the supported ones.
-    test_error_ret(
-        getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope),
-        "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);
-
-    for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
-    {
-      for(unsigned o2i = 0; o2i < memoryOrder.size(); o2i++)
-      {
-        for(unsigned si = 0; si < memoryScope.size(); si++)
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
+    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
+
+    CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {}
+    virtual std::string SingleTestName()
+    {
+        std::string testName =
+            CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
+        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
+            testName += std::string(", ")
+                + std::string(get_memory_order_type_name(MemoryOrder()))
+                      .substr(sizeof("memory"));
+        if (MemoryOrder2() != MEMORY_ORDER_EMPTY)
+            testName += std::string(", ")
+                + std::string(get_memory_order_type_name(MemoryOrder2()))
+                      .substr(sizeof("memory"));
+        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
+            testName += std::string(", ")
+                + std::string(get_memory_scope_type_name(MemoryScope()))
+                      .substr(sizeof("memory"));
+        return testName;
+    }
+    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
+                                           cl_context context,
+                                           cl_command_queue queue)
+    {
+        // repeat test for each reasonable memory order/scope combination
+        std::vector<TExplicitMemoryOrderType> memoryOrder;
+        std::vector<TExplicitMemoryScopeType> memoryScope;
+        int error = 0;
+
+        // For OpenCL-3.0 and later some orderings and scopes are optional, so
+        // here we query for the supported ones.
+        test_error_ret(getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder,
+                                                         memoryScope),
+                       "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);
+
+        for (unsigned oi = 0; oi < memoryOrder.size(); oi++)
         {
-          if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
-            && memoryOrder[oi] != memoryOrder[o2i])
-            continue; // both memory order arguments must be set (or none)
-          if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
-            && memoryScope[si] != MEMORY_SCOPE_EMPTY)
-            continue; // memory scope without memory order is not allowed
-          MemoryOrder(memoryOrder[oi]);
-          MemoryOrder2(memoryOrder[o2i]);
-          MemoryScope(memoryScope[si]);
-
-          if (CheckCapabilities(MemoryScope(), MemoryOrder())
-              == TEST_SKIPPED_ITSELF)
-              continue; // skip test - not applicable
-
-          if (CheckCapabilities(MemoryScope(), MemoryOrder2())
-              == TEST_SKIPPED_ITSELF)
-              continue; // skip test - not applicable
-
-          EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
+            for (unsigned o2i = 0; o2i < memoryOrder.size(); o2i++)
+            {
+                for (unsigned si = 0; si < memoryScope.size(); si++)
+                {
+                    if ((memoryOrder[oi] == MEMORY_ORDER_EMPTY
+                         || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
+                        && memoryOrder[oi] != memoryOrder[o2i])
+                        continue; // both memory order arguments must be set (or
+                                  // none)
+                    if ((memoryOrder[oi] == MEMORY_ORDER_EMPTY
+                         || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
+                        && memoryScope[si] != MEMORY_SCOPE_EMPTY)
+                        continue; // memory scope without memory order is not
+                                  // allowed
+                    MemoryOrder(memoryOrder[oi]);
+                    MemoryOrder2(memoryOrder[o2i]);
+                    MemoryScope(memoryScope[si]);
+
+                    if (CheckCapabilities(MemoryScope(), MemoryOrder())
+                        == TEST_SKIPPED_ITSELF)
+                        continue; // skip test - not applicable
+
+                    if (CheckCapabilities(MemoryScope(), MemoryOrder2())
+                        == TEST_SKIPPED_ITSELF)
+                        continue; // skip test - not applicable
+
+                    EXECUTE_TEST(error,
+                                 (CBasicTest<HostAtomicType, HostDataType>::
+                                      ExecuteForEachParameterSet(
+                                          deviceID, context, queue)));
+                }
+            }
         }
-      }
-    }
-    return error;
-  }
-  void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail) {_memoryOrder2 = memoryOrderFail;}
-  TExplicitMemoryOrderType MemoryOrder2() {return _memoryOrder2;}
-  std::string MemoryOrderFailStr()
-  {
-    if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
-      return std::string(", ")+get_memory_order_type_name(MemoryOrder2());
-    return "";
-  }
-  std::string MemoryOrderScope()
-  {
-    return MemoryOrderStr()+MemoryOrderFailStr()+MemoryScopeStr();
-  }
+        return error;
+    }
+    void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail)
+    {
+        _memoryOrder2 = memoryOrderFail;
+    }
+    TExplicitMemoryOrderType MemoryOrder2() { return _memoryOrder2; }
+    std::string MemoryOrderFailStr()
+    {
+        if (MemoryOrder2() != MEMORY_ORDER_EMPTY)
+            return std::string(", ")
+                + get_memory_order_type_name(MemoryOrder2());
+        return "";
+    }
+    std::string MemoryOrderScope()
+    {
+        return MemoryOrderStr() + MemoryOrderFailStr() + MemoryScopeStr();
+    }
+
 private:
-  TExplicitMemoryOrderType _memoryOrder2;
+    TExplicitMemoryOrderType _memoryOrder2;
 };
 
-template<typename HostAtomicType, typename HostDataType>
-std::string CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID)
+template <typename HostAtomicType, typename HostDataType>
+std::string
+CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID)
 {
-  std::string pragma;
-
-  if(gOldAPI)
-  {
-    pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n";
-    pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n";
-    pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n";
-    pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n";
-  }
-  // Create the pragma lines for this kernel
-  if(DataType().Size(deviceID) == 8)
-  {
-    pragma += "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
-    pragma += "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
-  }
-  if(_dataType == TYPE_ATOMIC_DOUBLE)
-    pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
-  return pragma;
+    std::string pragma;
+
+    if (gOldAPI)
+    {
+        pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : "
+                  "enable\n";
+        pragma += "#pragma OPENCL EXTENSION "
+                  "cl_khr_local_int32_extended_atomics : enable\n";
+        pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : "
+                  "enable\n";
+        pragma += "#pragma OPENCL EXTENSION "
+                  "cl_khr_global_int32_extended_atomics : enable\n";
+    }
+    // Create the pragma lines for this kernel
+    if (DataType().Size(deviceID) == 8)
+    {
+        pragma +=
+            "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
+        pragma +=
+            "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
+    }
+    if (_dataType == TYPE_ATOMIC_DOUBLE)
+        pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+    return pragma;
 }
 
-template<typename HostAtomicType, typename HostDataType>
-std::string CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
+template <typename HostAtomicType, typename HostDataType>
+std::string
+CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
 {
-  // Create the program header
-  std::string header;
-  std::string aTypeName = DataType().AtomicTypeName();
-  std::string cTypeName = DataType().RegularTypeName();
-  std::string argListForKernel;
-  std::string argListForFunction;
-  std::string argListNoTypes;
-  std::string functionPrototype;
-  std::string addressSpace = LocalMemory() ? "__local " : "__global ";
-
-  if(gOldAPI)
-  {
-    header += std::string("#define ")+aTypeName+" "+cTypeName+"\n"
-      "#define atomic_store(x,y)                                (*(x) = y)\n"
-      "#define atomic_load(x)                                   (*(x))\n"
-      "#define ATOMIC_VAR_INIT(x)                               (x)\n"
-      "#define ATOMIC_FLAG_INIT                                 0\n"
-      "#define atomic_init(x,y)                                 atomic_store(x,y)\n";
-    if(aTypeName == "atomic_float")
-      header += "#define atomic_exchange(x,y)                             atomic_xchg(x,y)\n";
-    else if(aTypeName == "atomic_double")
-      header += "double atomic_exchange(volatile "+addressSpace+"atomic_double *x, double y)\n"
-        "{\n"
-        "  long tmp = *(long*)&y, res;\n"
-        "  volatile "+addressSpace+"long *tmpA = (volatile "+addressSpace+"long)x;\n"
-        "  res = atom_xchg(tmpA,tmp);\n"
-        "  return *(double*)&res;\n"
-        "}\n";
-    else
-      header += "#define atomic_exchange(x,y)                             atom_xchg(x,y)\n";
-    if(aTypeName != "atomic_float" && aTypeName != "atomic_double")
-      header +=
-      "bool atomic_compare_exchange_strong(volatile "+addressSpace+" "+aTypeName+" *a, "+cTypeName+" *expected, "+cTypeName+" desired)\n"
-      "{\n"
-      "  "+cTypeName+" old = atom_cmpxchg(a, *expected, desired);\n"
-      "  if(old == *expected)\n"
-      "    return true;\n"
-      "  *expected = old;\n"
-      "  return false;\n"
-      "}\n"
-      "#define atomic_compare_exchange_weak                     atomic_compare_exchange_strong\n";
-    header +=
-      "#define atomic_fetch_add(x,y)                            atom_add(x,y)\n"
-      "#define atomic_fetch_sub(x,y)                            atom_sub(x,y)\n"
-      "#define atomic_fetch_or(x,y)                             atom_or(x,y)\n"
-      "#define atomic_fetch_xor(x,y)                            atom_xor(x,y)\n"
-      "#define atomic_fetch_and(x,y)                            atom_and(x,y)\n"
-      "#define atomic_fetch_min(x,y)                            atom_min(x,y)\n"
-      "#define atomic_fetch_max(x,y)                            atom_max(x,y)\n"
-      "#define atomic_flag_test_and_set(x)                      atomic_exchange(x,1)\n"
-      "#define atomic_flag_clear(x)                             atomic_store(x,0)\n"
-      "\n";
-  }
-  if(!LocalMemory() && DeclaredInProgram())
-  {
-    // additional atomic variable for results copying (last thread will do this)
-    header += "__global volatile atomic_uint finishedThreads = ATOMIC_VAR_INIT(0);\n";
-    // atomic variables declared in program scope - test data
-    std::stringstream ss;
-    ss << maxNumDestItems;
-    header += std::string("__global volatile ")+aTypeName+" destMemory["+ss.str()+"] = {\n";
-    ss.str("");
-    ss << _startValue;
-    for(cl_uint i = 0; i < maxNumDestItems; i++)
-    {
-      if(aTypeName == "atomic_flag")
-        header +=  "  ATOMIC_FLAG_INIT";
-      else
-        header +=  "  ATOMIC_VAR_INIT("+ss.str()+")";
-      if(i+1 < maxNumDestItems)
-        header += ",";
-      header += "\n";
-    }
-    header+=
-      "};\n"
-      "\n";
-  }
-  return header;
+    // Create the program header
+    std::string header;
+    std::string aTypeName = DataType().AtomicTypeName();
+    std::string cTypeName = DataType().RegularTypeName();
+    std::string argListForKernel;
+    std::string argListForFunction;
+    std::string argListNoTypes;
+    std::string functionPrototype;
+    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
+
+    if (gOldAPI)
+    {
+        header += std::string("#define ") + aTypeName + " " + cTypeName
+            + "\n"
+              "#define atomic_store(x,y)                                (*(x) "
+              "= y)\n"
+              "#define atomic_load(x)                                   "
+              "(*(x))\n"
+              "#define ATOMIC_VAR_INIT(x)                               (x)\n"
+              "#define ATOMIC_FLAG_INIT                                 0\n"
+              "#define atomic_init(x,y)                                 "
+              "atomic_store(x,y)\n";
+        if (aTypeName == "atomic_float")
+            header += "#define atomic_exchange(x,y)                            "
+                      " atomic_xchg(x,y)\n";
+        else if (aTypeName == "atomic_double")
+            header += "double atomic_exchange(volatile " + addressSpace
+                + "atomic_double *x, double y)\n"
+                  "{\n"
+                  "  long tmp = *(long*)&y, res;\n"
+                  "  volatile "
+                + addressSpace + "long *tmpA = (volatile " + addressSpace
+                + "long)x;\n"
+                  "  res = atom_xchg(tmpA,tmp);\n"
+                  "  return *(double*)&res;\n"
+                  "}\n";
+        else
+            header += "#define atomic_exchange(x,y)                            "
+                      " atom_xchg(x,y)\n";
+        if (aTypeName != "atomic_float" && aTypeName != "atomic_double")
+            header += "bool atomic_compare_exchange_strong(volatile "
+                + addressSpace + " " + aTypeName + " *a, " + cTypeName
+                + " *expected, " + cTypeName
+                + " desired)\n"
+                  "{\n"
+                  "  "
+                + cTypeName
+                + " old = atom_cmpxchg(a, *expected, desired);\n"
+                  "  if(old == *expected)\n"
+                  "    return true;\n"
+                  "  *expected = old;\n"
+                  "  return false;\n"
+                  "}\n"
+                  "#define atomic_compare_exchange_weak                     "
+                  "atomic_compare_exchange_strong\n";
+        header += "#define atomic_fetch_add(x,y)                            "
+                  "atom_add(x,y)\n"
+                  "#define atomic_fetch_sub(x,y)                            "
+                  "atom_sub(x,y)\n"
+                  "#define atomic_fetch_or(x,y)                             "
+                  "atom_or(x,y)\n"
+                  "#define atomic_fetch_xor(x,y)                            "
+                  "atom_xor(x,y)\n"
+                  "#define atomic_fetch_and(x,y)                            "
+                  "atom_and(x,y)\n"
+                  "#define atomic_fetch_min(x,y)                            "
+                  "atom_min(x,y)\n"
+                  "#define atomic_fetch_max(x,y)                            "
+                  "atom_max(x,y)\n"
+                  "#define atomic_flag_test_and_set(x)                      "
+                  "atomic_exchange(x,1)\n"
+                  "#define atomic_flag_clear(x)                             "
+                  "atomic_store(x,0)\n"
+                  "\n";
+    }
+    if (!LocalMemory() && DeclaredInProgram())
+    {
+        // additional atomic variable for results copying (last thread will do
+        // this)
+        header += "__global volatile atomic_uint finishedThreads = "
+                  "ATOMIC_VAR_INIT(0);\n";
+        // atomic variables declared in program scope - test data
+        std::stringstream ss;
+        ss << maxNumDestItems;
+        header += std::string("__global volatile ") + aTypeName + " destMemory["
+            + ss.str() + "] = {\n";
+        ss.str("");
+        ss << _startValue;
+        for (cl_uint i = 0; i < maxNumDestItems; i++)
+        {
+            if (aTypeName == "atomic_flag")
+                header += "  ATOMIC_FLAG_INIT";
+            else
+                header += "  ATOMIC_VAR_INIT(" + ss.str() + ")";
+            if (i + 1 < maxNumDestItems) header += ",";
+            header += "\n";
+        }
+        header += "};\n"
+                  "\n";
+    }
+    return header;
 }
 
-template<typename HostAtomicType, typename HostDataType>
+template <typename HostAtomicType, typename HostDataType> // FunctionCode(): returns the text of the generated test_atomic_function definition
 std::string CBasicTest<HostAtomicType, HostDataType>::FunctionCode()
 {
-  if(!UsedInFunction())
-    return "";
-  std::string addressSpace = LocalMemory() ? "__local " : "__global ";
-  std::string code = "void test_atomic_function(uint tid, uint threadCount, uint numDestItems, volatile ";
-  if(!GenericAddrSpace())
-    code += addressSpace;
-  code += std::string(DataType().AtomicTypeName())+" *destMemory, __global "+DataType().RegularTypeName()+
-    " *oldValues";
-  if(LocalRefValues())
-    code += std::string(", __local ")+DataType().RegularTypeName()+" *localValues";
-  code += ")\n"
-    "{\n";
-  code += ProgramCore();
-  code += "}\n"
-    "\n";
-  return code;
+    if (!UsedInFunction()) return ""; // empty result: no helper function is generated
+    std::string addressSpace = LocalMemory() ? "__local " : "__global "; // qualifier for destMemory
+    std::string code = "void test_atomic_function(uint tid, uint threadCount, "
+                       "uint numDestItems, volatile ";
+    if (!GenericAddrSpace()) code += addressSpace; // destMemory gets no qualifier when GenericAddrSpace() is true
+    code += std::string(DataType().AtomicTypeName()) + " *destMemory, __global "
+        + DataType().RegularTypeName() + " *oldValues";
+    if (LocalRefValues()) // appends the __local localValues parameter
+        code += std::string(", __local ") + DataType().RegularTypeName()
+            + " *localValues";
+    code += ")\n"
+            "{\n";
+    code += ProgramCore(); // body of the generated function is whatever ProgramCore() returns
+    code += "}\n"
+            "\n";
+    return code; // signature + ProgramCore() body + closing brace and a blank line
 }
 
-template<typename HostAtomicType, typename HostDataType>
-std::string CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
+template <typename HostAtomicType, typename HostDataType>
+std::string
+CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
 {
-  std::string aTypeName = DataType().AtomicTypeName();
-  std::string cTypeName = DataType().RegularTypeName();
-  std::string addressSpace = LocalMemory() ? "__local " : "__global ";
-  std::string code = "__kernel void test_atomic_kernel(uint threadCount, uint numDestItems, ";
-
-  // prepare list of arguments for kernel
-  if(LocalMemory())
-  {
-    code += std::string("__global ")+cTypeName+" *finalDest, __global "+cTypeName+" *oldValues,"
-      " volatile "+addressSpace+aTypeName+" *"+(DeclaredInProgram() ? "notUsed" : "")+"destMemory";
-  }
-  else
-  {
-    code += "volatile "+addressSpace+(DeclaredInProgram() ? (cTypeName+" *finalDest") : (aTypeName+" *destMemory"))+
-      ", __global "+cTypeName+" *oldValues";
-  }
-  if(LocalRefValues())
-    code += std::string(", __local ")+cTypeName+" *localValues";
-  code += ")\n"
-    "{\n";
-  if(LocalMemory() && DeclaredInProgram())
-  {
-    // local atomics declared in kernel scope
-    std::stringstream ss;
-    ss << maxNumDestItems;
-    code += std::string("  __local volatile ")+aTypeName+" destMemory["+ss.str()+"];\n";
-  }
-  code += "  uint  tid = get_global_id(0);\n"
-    "\n";
-  if(LocalMemory())
-  {
-      // memory_order_relaxed is sufficient for these initialization operations
-      // as the barrier below will act as a fence, providing an order to the
-      // operations. memory_scope_work_group is sufficient as local memory is
-      // only visible within the work-group.
-      code += R"(
+    std::string aTypeName = DataType().AtomicTypeName();
+    std::string cTypeName = DataType().RegularTypeName();
+    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
+    std::string code = "__kernel void test_atomic_kernel(uint threadCount, "
+                       "uint numDestItems, ";
+
+    // prepare list of arguments for kernel
+    if (LocalMemory())
+    {
+        code += std::string("__global ") + cTypeName + " *finalDest, __global "
+            + cTypeName
+            + " *oldValues,"
+              " volatile "
+            + addressSpace + aTypeName + " *"
+            + (DeclaredInProgram() ? "notUsed" : "") + "destMemory";
+    }
+    else
+    {
+        code += "volatile " + addressSpace
+            + (DeclaredInProgram() ? (cTypeName + " *finalDest")
+                                   : (aTypeName + " *destMemory"))
+            + ", __global " + cTypeName + " *oldValues";
+    }
+    if (LocalRefValues())
+        code += std::string(", __local ") + cTypeName + " *localValues";
+    code += ")\n"
+            "{\n";
+    if (LocalMemory() && DeclaredInProgram())
+    {
+        // local atomics declared in kernel scope
+        std::stringstream ss;
+        ss << maxNumDestItems;
+        code += std::string("  __local volatile ") + aTypeName + " destMemory["
+            + ss.str() + "];\n";
+    }
+    code += "  uint  tid = get_global_id(0);\n"
+            "\n";
+    if (LocalMemory())
+    {
+        // memory_order_relaxed is sufficient for these initialization
+        // operations as the barrier below will act as a fence, providing an
+        // order to the operations. memory_scope_work_group is sufficient as
+        // local memory is only visible within the work-group.
+        code += R"(
               // initialize atomics not reachable from host (first thread
               // is doing this, other threads are waiting on barrier)
               if(get_local_id(0) == 0)
                 for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)
                 {)";
-      if (aTypeName == "atomic_flag")
-      {
-          code += R"(
+        if (aTypeName == "atomic_flag")
+        {
+            code += R"(
                   if(finalDest[dstItemIdx])
                     atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                                       memory_order_relaxed,
@@ -823,512 +976,565 @@ std::string CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumD
                     atomic_flag_clear_explicit(destMemory+dstItemIdx,
                                                memory_order_relaxed,
                                                memory_scope_work_group);)";
-      }
-    else
-    {
-        code += R"(
+        }
+        else
+        {
+            code += R"(
                 atomic_store_explicit(destMemory+dstItemIdx,
                                       finalDest[dstItemIdx],
                                       memory_order_relaxed,
                                       memory_scope_work_group);)";
+        }
+        code += "    }\n"
+                "  barrier(CLK_LOCAL_MEM_FENCE);\n"
+                "\n";
     }
-    code +=
-      "    }\n"
-      "  barrier(CLK_LOCAL_MEM_FENCE);\n"
-      "\n";
-  }
-  if (LocalRefValues())
-  {
-    code +=
-      "  // Copy input reference values into local memory\n";
-    if (NumNonAtomicVariablesPerThread() == 1)
-      code += "  localValues[get_local_id(0)] = oldValues[tid];\n";
-    else
+    if (LocalRefValues())
     {
-      std::stringstream ss;
-      ss << NumNonAtomicVariablesPerThread();
-      code +=
-        "  for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
-        "    localValues[get_local_id(0)*" + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n";
-    }
-    code +=
-      "  barrier(CLK_LOCAL_MEM_FENCE);\n"
-      "\n";
-  }
-  if (UsedInFunction())
-    code += std::string("  test_atomic_function(tid, threadCount, numDestItems, destMemory, oldValues")+
-    (LocalRefValues() ? ", localValues" : "")+");\n";
-  else
-    code += ProgramCore();
-  code += "\n";
-  if (LocalRefValues())
-  {
-    code +=
-      "  // Copy local reference values into output array\n"
-      "  barrier(CLK_LOCAL_MEM_FENCE);\n";
-    if (NumNonAtomicVariablesPerThread() == 1)
-      code += "  oldValues[tid] = localValues[get_local_id(0)];\n";
+        code += "  // Copy input reference values into local memory\n";
+        if (NumNonAtomicVariablesPerThread() == 1)
+            code += "  localValues[get_local_id(0)] = oldValues[tid];\n";
+        else
+        {
+            std::stringstream ss;
+            ss << NumNonAtomicVariablesPerThread();
+            code += "  for(uint rfId = 0; rfId < " + ss.str()
+                + "; rfId++)\n"
+                  "    localValues[get_local_id(0)*"
+                + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n";
+        }
+        code += "  barrier(CLK_LOCAL_MEM_FENCE);\n"
+                "\n";
+    }
+    if (UsedInFunction())
+        code += std::string("  test_atomic_function(tid, threadCount, "
+                            "numDestItems, destMemory, oldValues")
+            + (LocalRefValues() ? ", localValues" : "") + ");\n";
     else
+        code += ProgramCore();
+    code += "\n";
+    if (LocalRefValues())
     {
-      std::stringstream ss;
-      ss << NumNonAtomicVariablesPerThread();
-      code +=
-        "  for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
-        "    oldValues[tid*" + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str() + "+rfId];\n";
+        code += "  // Copy local reference values into output array\n"
+                "  barrier(CLK_LOCAL_MEM_FENCE);\n";
+        if (NumNonAtomicVariablesPerThread() == 1)
+            code += "  oldValues[tid] = localValues[get_local_id(0)];\n";
+        else
+        {
+            std::stringstream ss;
+            ss << NumNonAtomicVariablesPerThread();
+            code += "  for(uint rfId = 0; rfId < " + ss.str()
+                + "; rfId++)\n"
+                  "    oldValues[tid*"
+                + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str()
+                + "+rfId];\n";
+        }
+        code += "\n";
     }
-    code += "\n";
-  }
-  if(LocalMemory() || DeclaredInProgram())
-  {
-    code += "  // Copy final values to host reachable buffer\n";
-    if(LocalMemory())
-      code +=
-        "  barrier(CLK_LOCAL_MEM_FENCE);\n"
-        "  if(get_local_id(0) == 0) // first thread in workgroup\n";
-    else
-      // global atomics declared in program scope
-      code += R"(
+    if (LocalMemory() || DeclaredInProgram())
+    {
+        code += "  // Copy final values to host reachable buffer\n";
+        if (LocalMemory())
+            code += "  barrier(CLK_LOCAL_MEM_FENCE);\n"
+                    "  if(get_local_id(0) == 0) // first thread in workgroup\n";
+        else
+            // global atomics declared in program scope
+            code += R"(
                 if(atomic_fetch_add_explicit(&finishedThreads, 1u,
                                            memory_order_relaxed,
                                            memory_scope_work_group)
                    == get_global_size(0)-1) // last finished thread
                    )";
-    code +=
-        "    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)\n";
-    if(aTypeName == "atomic_flag")
-    {
-        code += R"(
+        code += "    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; "
+                "dstItemIdx++)\n";
+        if (aTypeName == "atomic_flag")
+        {
+            code += R"(
                 finalDest[dstItemIdx] =
                     atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                                       memory_order_relaxed,
                                                       memory_scope_work_group);)";
-    }
-    else
-    {
-        code += R"(
+        }
+        else
+        {
+            code += R"(
                 finalDest[dstItemIdx] =
                     atomic_load_explicit(destMemory+dstItemIdx,
                                          memory_order_relaxed,
                                          memory_scope_work_group);)";
+        }
     }
-  }
-  code += "}\n"
-    "\n";
-  return code;
+    code += "}\n"
+            "\n";
+    return code;
 }
 
 template <typename HostAtomicType, typename HostDataType>
-int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
+int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue)
 {
-  int error;
-  clProgramWrapper program;
-  clKernelWrapper kernel;
-  size_t threadNum[1];
-  clMemWrapper streams[2];
-  std::vector<HostAtomicType> destItems;
-  HostAtomicType *svmAtomicBuffer = 0;
-  std::vector<HostDataType> refValues, startRefValues;
-  HostDataType *svmDataBuffer = 0;
-  cl_uint deviceThreadCount, hostThreadCount, threadCount;
-  size_t groupSize = 0;
-  std::string programSource;
-  const char *programLine;
-  MTdata d;
-  size_t typeSize = DataType().Size(deviceID);
-
-  deviceThreadCount = _maxDeviceThreads;
-  hostThreadCount = MaxHostThreads();
-  threadCount = deviceThreadCount+hostThreadCount;
-
-  //log_info("\t%s %s%s...\n", local ? "local" : "global", DataType().AtomicTypeName(), memoryOrderScope.c_str());
-  log_info("\t%s...\n", SingleTestName().c_str());
-
-  if(!LocalMemory() && DeclaredInProgram() && gNoGlobalVariables) // no support for program scope global variables
-  {
-    log_info("\t\tTest disabled\n");
-    return 0;
-  }
-  if(UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace)
-  {
-    log_info("\t\tTest disabled\n");
-    return 0;
-  }
-
-  // set up work sizes based on device capabilities and test configuration
-  error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL);
-  test_error(error, "Unable to obtain max work group size for device");
-  CurrentGroupSize((cl_uint)groupSize);
-  if(CurrentGroupSize() > deviceThreadCount)
-    CurrentGroupSize(deviceThreadCount);
-  if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
-    deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
-  threadCount = deviceThreadCount+hostThreadCount;
-
-  // If we're given a num_results function, we need to determine how many result objects we need.
-  // This is the first assessment for current maximum number of threads (exact thread count is not known here)
-  // - needed for program source code generation (arrays of atomics declared in program)
-  cl_uint numDestItems = NumResults(threadCount, deviceID);
-
-  if(deviceThreadCount > 0)
-  {
-      // This loop iteratively reduces the workgroup size by 2 and then
-      // re-generates the kernel with the reduced
-      // workgroup size until we find a size which is admissible for the kernel
-      // being run or reduce the wg size
-      // to the trivial case of 1 (which was separately verified to be accurate
-      // for the kernel being run)
-
-      while ((CurrentGroupSize() > 1))
-      {
-          // Re-generate the kernel code with the current group size
-          if (kernel) clReleaseKernel(kernel);
-          if (program) clReleaseProgram(program);
-          programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems)
-              + FunctionCode() + KernelCode(numDestItems);
-          programLine = programSource.c_str();
-          if (create_single_kernel_helper_with_build_options(
-                  context, &program, &kernel, 1, &programLine,
-                  "test_atomic_kernel", gOldAPI ? "" : nullptr))
-          {
-              return -1;
-          }
-          // Get work group size for the new kernel
-          error = clGetKernelWorkGroupInfo(kernel, deviceID,
-                                           CL_KERNEL_WORK_GROUP_SIZE,
-                                           sizeof(groupSize), &groupSize, NULL);
-          test_error(error,
-                     "Unable to obtain max work group size for device and "
-                     "kernel combo");
-
-          if (LocalMemory())
-          {
-              cl_ulong usedLocalMemory;
-              cl_ulong totalLocalMemory;
-              cl_uint maxWorkGroupSize;
-
-              error = clGetKernelWorkGroupInfo(
-                  kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
-                  sizeof(usedLocalMemory), &usedLocalMemory, NULL);
-              test_error(error, "clGetKernelWorkGroupInfo failed");
-
-              error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
-                                      sizeof(totalLocalMemory),
-                                      &totalLocalMemory, NULL);
-              test_error(error, "clGetDeviceInfo failed");
-
-              // We know that each work-group is going to use typeSize *
-              // deviceThreadCount bytes of local memory
-              // so pick the maximum value for deviceThreadCount that uses all
-              // the local memory.
-              maxWorkGroupSize =
-                  ((totalLocalMemory - usedLocalMemory) / typeSize);
-
-              if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize;
-          }
-          if (CurrentGroupSize() <= groupSize)
-              break;
-          else
-              CurrentGroupSize(CurrentGroupSize() / 2);
-      }
-    if(CurrentGroupSize() > deviceThreadCount)
-      CurrentGroupSize(deviceThreadCount);
-    if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
-      deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
-    threadCount = deviceThreadCount+hostThreadCount;
-  }
-  if (gDebug)
-  {
-      log_info("Program source:\n");
-      log_info("%s\n", programLine);
-  }
-  if(deviceThreadCount > 0)
-    log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, CurrentGroupSize());
-  if(hostThreadCount > 0)
-    log_info("\t\t(host threads %u)\n", hostThreadCount);
-
-  refValues.resize(threadCount*NumNonAtomicVariablesPerThread());
-
-  // Generate ref data if we have a ref generator provided
-  d = init_genrand(gRandomSeed);
-  startRefValues.resize(threadCount*NumNonAtomicVariablesPerThread());
-  if(GenerateRefs(threadCount, &startRefValues[0], d))
-  {
-    //copy ref values for host threads
-    memcpy(&refValues[0], &startRefValues[0], sizeof(HostDataType)*threadCount*NumNonAtomicVariablesPerThread());
-  }
-  else
-  {
-    startRefValues.resize(0);
-  }
-  free_mtdata(d);
-  d = NULL;
-
-  // If we're given a num_results function, we need to determine how many result objects we need. If
-  // we don't have it, we assume it's just 1
-  // This is final value (exact thread count is known in this place)
-  numDestItems = NumResults(threadCount, deviceID);
-
-  destItems.resize(numDestItems);
-  for(cl_uint i = 0; i < numDestItems; i++)
-    destItems[i] = _startValue;
-
-  // Create main buffer with atomic variables (array size dependent on particular test)
-  if(UseSVM())
-  {
-    if(gUseHostPtr)
-      svmAtomicBuffer = (HostAtomicType*)malloc(typeSize * numDestItems);
-    else
-      svmAtomicBuffer = (HostAtomicType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, typeSize * numDestItems, 0);
-    if(!svmAtomicBuffer)
-    {
-      log_error("ERROR: clSVMAlloc failed!\n");
-      return -1;
-    }
-    memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems);
-    streams[0] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
-                                typeSize * numDestItems, svmAtomicBuffer, NULL);
-  }
-  else
-  {
-      streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
-                                  typeSize * numDestItems, &destItems[0], NULL);
-  }
-  if (!streams[0])
-  {
-    log_error("ERROR: Creating output array failed!\n");
-    return -1;
-  }
-  // Create buffer for per-thread input/output data
-  if(UseSVM())
-  {
-    if(gUseHostPtr)
-      svmDataBuffer = (HostDataType*)malloc(typeSize*threadCount*NumNonAtomicVariablesPerThread());
-    else
-      svmDataBuffer = (HostDataType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS : 0), typeSize*threadCount*NumNonAtomicVariablesPerThread(), 0);
-    if(!svmDataBuffer)
-    {
-      log_error("ERROR: clSVMAlloc failed!\n");
-      return -1;
-    }
-    if(startRefValues.size())
-      memcpy(svmDataBuffer, &startRefValues[0], typeSize*threadCount*NumNonAtomicVariablesPerThread());
-    streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
-                                typeSize * threadCount
-                                    * NumNonAtomicVariablesPerThread(),
-                                svmDataBuffer, NULL);
-  }
-  else
-  {
-      streams[1] = clCreateBuffer(
-          context,
-          ((startRefValues.size() ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)),
-          typeSize * threadCount * NumNonAtomicVariablesPerThread(),
-          startRefValues.size() ? &startRefValues[0] : 0, NULL);
-  }
-  if (!streams[1])
-  {
-    log_error("ERROR: Creating reference array failed!\n");
-    return -1;
-  }
-  if(deviceThreadCount > 0)
-  {
-    cl_uint argInd = 0;
-    /* Set the arguments */
-    error = clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount);
-    test_error(error, "Unable to set kernel argument");
-    error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems), &numDestItems);
-    test_error(error, "Unable to set indexed kernel argument");
-    error = clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]);
-    test_error(error, "Unable to set indexed kernel arguments");
-    error = clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]);
-    test_error(error, "Unable to set indexed kernel arguments");
-    if(LocalMemory())
-    {
-      error = clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL);
-      test_error(error, "Unable to set indexed local kernel argument");
-    }
-    if(LocalRefValues())
-    {
-      error = clSetKernelArg(kernel, argInd++, LocalRefValues() ? typeSize*CurrentGroupSize()*NumNonAtomicVariablesPerThread() : 1, NULL);
-      test_error(error, "Unable to set indexed kernel argument");
-    }
-  }
-  /* Configure host threads */
-  std::vector<THostThreadContext> hostThreadContexts(hostThreadCount);
-  for(unsigned int t = 0; t < hostThreadCount; t++)
-  {
-    hostThreadContexts[t].test = this;
-    hostThreadContexts[t].tid = deviceThreadCount+t;
-    hostThreadContexts[t].threadCount = threadCount;
-    hostThreadContexts[t].destMemory = UseSVM() ? svmAtomicBuffer : &destItems[0];
-    hostThreadContexts[t].oldValues = UseSVM() ? svmDataBuffer : &refValues[0];
-  }
-
-  if(deviceThreadCount > 0)
-  {
-    /* Run the kernel */
-    threadNum[0] = deviceThreadCount;
-    groupSize = CurrentGroupSize();
-    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, &groupSize, 0, NULL, NULL);
-    test_error(error, "Unable to execute test kernel");
-    /* start device threads */
-    error = clFlush(queue);
-    test_error(error, "clFlush failed");
-  }
-
-  /* Start host threads and wait for finish */
-  if(hostThreadCount > 0)
-    ThreadPool_Do(HostThreadFunction, hostThreadCount, &hostThreadContexts[0]);
-
-  if(UseSVM())
-  {
-    error = clFinish(queue);
-    test_error(error, "clFinish failed");
-    memcpy(&destItems[0], svmAtomicBuffer, typeSize*numDestItems);
-    memcpy(&refValues[0], svmDataBuffer, typeSize*threadCount*NumNonAtomicVariablesPerThread());
-  }
-  else
-  {
-    if(deviceThreadCount > 0)
-    {
-      error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
-      test_error(error, "Unable to read result value!");
-      error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize * deviceThreadCount*NumNonAtomicVariablesPerThread(), &refValues[0], 0, NULL, NULL);
-      test_error(error, "Unable to read reference values!");
-    }
-  }
-  bool dataVerified = false;
-  // If we have an expectedFn, then we need to generate a final value to compare against. If we don't
-  // have one, it's because we're comparing ref values only
-  for(cl_uint i = 0; i < numDestItems; i++)
-  {
-    HostDataType expected;
-
-    if(!ExpectedValue(expected, threadCount, startRefValues.size() ? &startRefValues[0] : 0, i))
-      break; // no expected value function provided
-
-    if(expected != destItems[i])
-    {
-      std::stringstream logLine;
-      logLine << "ERROR: Result " << i << " from kernel does not validate! (should be " << expected << ", was " << destItems[i] << ")\n";
-      log_error("%s", logLine.str().c_str());
-      for(i = 0; i < threadCount; i++)
-      {
-        logLine.str("");
-        logLine << " --- " << i << " - ";
-        if(startRefValues.size())
-          logLine << startRefValues[i] << " -> " << refValues[i];
-        else
-          logLine << refValues[i];
-        logLine << " --- ";
-        if(i < numDestItems)
-          logLine << destItems[i];
-        logLine << "\n";
-        log_info("%s", logLine.str().c_str());
-      }
-      if(!gDebug)
-      {
-        log_info("Program source:\n");
-        log_info("%s\n", programLine);
-      }
-      return -1;
-    }
-    dataVerified = true;
-  }
-
-  bool dataCorrect = false;
-  /* Use the verify function (if provided) to also check the results */
-  if(VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0]))
-  {
-    if(!dataCorrect)
-    {
-      log_error("ERROR: Reference values did not validate!\n");
-      std::stringstream logLine;
-      for(cl_uint i = 0; i < threadCount; i++)
-      for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++)
-      {
-        logLine.str("");
-        logLine << " --- " << i << " - " << refValues[i*NumNonAtomicVariablesPerThread()+j] << " --- ";
-        if(j == 0 && i < numDestItems)
-          logLine << destItems[i];
-        logLine << "\n";
-        log_info("%s", logLine.str().c_str());
-      }
-      if(!gDebug)
-      {
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    size_t threadNum[1];
+    clMemWrapper streams[2];
+    std::vector<HostAtomicType> destItems;
+    HostAtomicType *svmAtomicBuffer = 0;
+    std::vector<HostDataType> refValues, startRefValues;
+    HostDataType *svmDataBuffer = 0;
+    cl_uint deviceThreadCount, hostThreadCount, threadCount;
+    size_t groupSize = 0;
+    std::string programSource;
+    const char *programLine;
+    MTdata d;
+    size_t typeSize = DataType().Size(deviceID);
+
+    deviceThreadCount = _maxDeviceThreads;
+    hostThreadCount = MaxHostThreads();
+    threadCount = deviceThreadCount + hostThreadCount;
+
+    // log_info("\t%s %s%s...\n", local ? "local" : "global",
+    // DataType().AtomicTypeName(), memoryOrderScope.c_str());
+    log_info("\t%s...\n", SingleTestName().c_str());
+
+    if (!LocalMemory() && DeclaredInProgram()
+        && gNoGlobalVariables) // no support for program scope global variables
+    {
+        log_info("\t\tTest disabled\n");
+        return 0;
+    }
+    if (UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace)
+    {
+        log_info("\t\tTest disabled\n");
+        return 0;
+    }
+
+    // set up work sizes based on device capabilities and test configuration
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                            sizeof(groupSize), &groupSize, NULL);
+    test_error(error, "Unable to obtain max work group size for device");
+    CurrentGroupSize((cl_uint)groupSize);
+    if (CurrentGroupSize() > deviceThreadCount)
+        CurrentGroupSize(deviceThreadCount);
+    if (CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
+        deviceThreadCount =
+            CurrentGroupSize() * CurrentGroupNum(deviceThreadCount);
+    threadCount = deviceThreadCount + hostThreadCount;
+
+    // If we're given a num_results function, we need to determine how many
+    // result objects we need. This is the first assessment for current maximum
+    // number of threads (exact thread count is not known here)
+    // - needed for program source code generation (arrays of atomics declared
+    // in program)
+    cl_uint numDestItems = NumResults(threadCount, deviceID);
+
+    if (deviceThreadCount > 0)
+    {
+        // This loop iteratively reduces the workgroup size by 2 and then
+        // re-generates the kernel with the reduced
+        // workgroup size until we find a size which is admissible for the
+        // kernel being run or reduce the wg size to the trivial case of 1
+        // (which was separately verified to be accurate for the kernel being
+        // run)
+
+        while ((CurrentGroupSize() > 1))
+        {
+            // Re-generate the kernel code with the current group size
+            if (kernel) clReleaseKernel(kernel);
+            if (program) clReleaseProgram(program);
+            programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems)
+                + FunctionCode() + KernelCode(numDestItems);
+            programLine = programSource.c_str();
+            if (create_single_kernel_helper_with_build_options(
+                    context, &program, &kernel, 1, &programLine,
+                    "test_atomic_kernel", gOldAPI ? "" : nullptr))
+            {
+                return -1;
+            }
+            // Get work group size for the new kernel
+            error = clGetKernelWorkGroupInfo(
+                kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(groupSize),
+                &groupSize, NULL);
+            test_error(error,
+                       "Unable to obtain max work group size for device and "
+                       "kernel combo");
+
+            if (LocalMemory())
+            {
+                cl_ulong usedLocalMemory;
+                cl_ulong totalLocalMemory;
+                cl_uint maxWorkGroupSize;
+
+                error = clGetKernelWorkGroupInfo(
+                    kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
+                    sizeof(usedLocalMemory), &usedLocalMemory, NULL);
+                test_error(error, "clGetKernelWorkGroupInfo failed");
+
+                error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
+                                        sizeof(totalLocalMemory),
+                                        &totalLocalMemory, NULL);
+                test_error(error, "clGetDeviceInfo failed");
+
+                // We know that each work-group is going to use typeSize *
+                // deviceThreadCount bytes of local memory
+                // so pick the maximum value for deviceThreadCount that uses all
+                // the local memory.
+                maxWorkGroupSize =
+                    ((totalLocalMemory - usedLocalMemory) / typeSize);
+
+                if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize;
+            }
+            if (CurrentGroupSize() <= groupSize)
+                break;
+            else
+                CurrentGroupSize(CurrentGroupSize() / 2);
+        }
+        if (CurrentGroupSize() > deviceThreadCount)
+            CurrentGroupSize(deviceThreadCount);
+        if (CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
+            deviceThreadCount =
+                CurrentGroupSize() * CurrentGroupNum(deviceThreadCount);
+        threadCount = deviceThreadCount + hostThreadCount;
+    }
+    if (gDebug)
+    {
         log_info("Program source:\n");
         log_info("%s\n", programLine);
-      }
-      return -1;
-    }
-  }
-  else if(!dataVerified)
-  {
-    log_error("ERROR: Test doesn't check total or refs; no values are verified!\n");
-    return -1;
-  }
-
-  if(OldValueCheck() &&
-    !(DeclaredInProgram() && !LocalMemory())) // don't test for programs scope global atomics
-                                             // 'old' value has been overwritten by previous clEnqueueNDRangeKernel
-  {
-    /* Re-write the starting value */
-    for(size_t i = 0; i < numDestItems; i++)
-      destItems[i] = _startValue;
-    refValues[0] = 0;
-    if(deviceThreadCount > 0)
-    {
-      error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
-      test_error(error, "Unable to write starting values!");
-
-      /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */
-      threadNum[0] = 1;
-      error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, threadNum, 0, NULL, NULL);
-      test_error(error, "Unable to execute test kernel");
-
-      error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize, &refValues[0], 0, NULL, NULL);
-      test_error(error, "Unable to read reference values!");
+    }
+    if (deviceThreadCount > 0)
+        log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount,
+                 CurrentGroupSize());
+    if (hostThreadCount > 0)
+        log_info("\t\t(host threads %u)\n", hostThreadCount);
+
+    refValues.resize(threadCount * NumNonAtomicVariablesPerThread());
+
+    // Generate ref data if we have a ref generator provided
+    d = init_genrand(gRandomSeed);
+    startRefValues.resize(threadCount * NumNonAtomicVariablesPerThread());
+    if (GenerateRefs(threadCount, &startRefValues[0], d))
+    {
+        // copy ref values for host threads
+        memcpy(&refValues[0], &startRefValues[0],
+               sizeof(HostDataType) * threadCount
+                   * NumNonAtomicVariablesPerThread());
     }
     else
     {
-      /* Start host thread */
-      HostFunction(0, 1, &destItems[0], &refValues[0]);
+        startRefValues.resize(0);
     }
+    free_mtdata(d);
+    d = NULL;
+
+    // If we're given a num_results function, we need to determine how many
+    // result objects we need. If we don't have it, we assume it's just 1. This
+    // is the final value (the exact thread count is known at this point)
+    numDestItems = NumResults(threadCount, deviceID);
 
-    if(refValues[0] != _startValue)//destItems[0])
+    destItems.resize(numDestItems);
+    for (cl_uint i = 0; i < numDestItems; i++) destItems[i] = _startValue;
+
+    // Create main buffer with atomic variables (array size dependent on
+    // particular test)
+    if (UseSVM())
     {
-      std::stringstream logLine;
-      logLine << "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
-        " (should have been " << destItems[0] << ", returned " << refValues[0] << ")!\n";
-      log_error("%s", logLine.str().c_str());
-      if(!gDebug)
-      {
-        log_info("Program source:\n");
-        log_info("%s\n", programLine);
-      }
-      return -1;
-    }
-  }
-  if(UseSVM())
-  {
-    // the buffer object must first be released before the SVM buffer is freed
-    error = clReleaseMemObject(streams[0]);
-    streams[0] = 0;
-    test_error(error, "clReleaseMemObject failed");
-    if(gUseHostPtr)
-      free(svmAtomicBuffer);
+        if (gUseHostPtr)
+            svmAtomicBuffer = (HostAtomicType *)malloc(typeSize * numDestItems);
+        else
+            svmAtomicBuffer = (HostAtomicType *)clSVMAlloc(
+                context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
+                typeSize * numDestItems, 0);
+        if (!svmAtomicBuffer)
+        {
+            log_error("ERROR: clSVMAlloc failed!\n");
+            return -1;
+        }
+        memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems);
+        streams[0] =
+            clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
+                           typeSize * numDestItems, svmAtomicBuffer, NULL);
+    }
+    else
+    {
+        streams[0] =
+            clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                           typeSize * numDestItems, &destItems[0], NULL);
+    }
+    if (!streams[0])
+    {
+        log_error("ERROR: Creating output array failed!\n");
+        return -1;
+    }
+    // Create buffer for per-thread input/output data
+    if (UseSVM())
+    {
+        if (gUseHostPtr)
+            svmDataBuffer = (HostDataType *)malloc(
+                typeSize * threadCount * NumNonAtomicVariablesPerThread());
+        else
+            svmDataBuffer = (HostDataType *)clSVMAlloc(
+                context,
+                CL_MEM_SVM_FINE_GRAIN_BUFFER
+                    | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS
+                                                       : 0),
+                typeSize * threadCount * NumNonAtomicVariablesPerThread(), 0);
+        if (!svmDataBuffer)
+        {
+            log_error("ERROR: clSVMAlloc failed!\n");
+            return -1;
+        }
+        if (startRefValues.size())
+            memcpy(svmDataBuffer, &startRefValues[0],
+                   typeSize * threadCount * NumNonAtomicVariablesPerThread());
+        streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
+                                    typeSize * threadCount
+                                        * NumNonAtomicVariablesPerThread(),
+                                    svmDataBuffer, NULL);
+    }
     else
-      clSVMFree(context, svmAtomicBuffer);
-    error = clReleaseMemObject(streams[1]);
-    streams[1] = 0;
-    test_error(error, "clReleaseMemObject failed");
-    if(gUseHostPtr)
-      free(svmDataBuffer);
+    {
+        streams[1] = clCreateBuffer(
+            context,
+            ((startRefValues.size() ? CL_MEM_COPY_HOST_PTR
+                                    : CL_MEM_READ_WRITE)),
+            typeSize * threadCount * NumNonAtomicVariablesPerThread(),
+            startRefValues.size() ? &startRefValues[0] : 0, NULL);
+    }
+    if (!streams[1])
+    {
+        log_error("ERROR: Creating reference array failed!\n");
+        return -1;
+    }
+    if (deviceThreadCount > 0)
+    {
+        cl_uint argInd = 0;
+        /* Set the arguments */
+        error =
+            clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount);
+        test_error(error, "Unable to set kernel argument");
+        error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems),
+                               &numDestItems);
+        test_error(error, "Unable to set indexed kernel argument");
+        error =
+            clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]);
+        test_error(error, "Unable to set indexed kernel arguments");
+        error =
+            clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]);
+        test_error(error, "Unable to set indexed kernel arguments");
+        if (LocalMemory())
+        {
+            error =
+                clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL);
+            test_error(error, "Unable to set indexed local kernel argument");
+        }
+        if (LocalRefValues())
+        {
+            error =
+                clSetKernelArg(kernel, argInd++,
+                               LocalRefValues() ? typeSize * CurrentGroupSize()
+                                       * NumNonAtomicVariablesPerThread()
+                                                : 1,
+                               NULL);
+            test_error(error, "Unable to set indexed kernel argument");
+        }
+    }
+    /* Configure host threads */
+    std::vector<THostThreadContext> hostThreadContexts(hostThreadCount);
+    for (unsigned int t = 0; t < hostThreadCount; t++)
+    {
+        hostThreadContexts[t].test = this;
+        hostThreadContexts[t].tid = deviceThreadCount + t;
+        hostThreadContexts[t].threadCount = threadCount;
+        hostThreadContexts[t].destMemory =
+            UseSVM() ? svmAtomicBuffer : &destItems[0];
+        hostThreadContexts[t].oldValues =
+            UseSVM() ? svmDataBuffer : &refValues[0];
+    }
+
+    if (deviceThreadCount > 0)
+    {
+        /* Run the kernel */
+        threadNum[0] = deviceThreadCount;
+        groupSize = CurrentGroupSize();
+        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum,
+                                       &groupSize, 0, NULL, NULL);
+        test_error(error, "Unable to execute test kernel");
+        /* start device threads */
+        error = clFlush(queue);
+        test_error(error, "clFlush failed");
+    }
+
+    /* Start host threads and wait for finish */
+    if (hostThreadCount > 0)
+        ThreadPool_Do(HostThreadFunction, hostThreadCount,
+                      &hostThreadContexts[0]);
+
+    if (UseSVM())
+    {
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+        memcpy(&destItems[0], svmAtomicBuffer, typeSize * numDestItems);
+        memcpy(&refValues[0], svmDataBuffer,
+               typeSize * threadCount * NumNonAtomicVariablesPerThread());
+    }
     else
-      clSVMFree(context, svmDataBuffer);
-  }
-  _passCount++;
-  return 0;
+    {
+        if (deviceThreadCount > 0)
+        {
+            error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0,
+                                        typeSize * numDestItems, &destItems[0],
+                                        0, NULL, NULL);
+            test_error(error, "Unable to read result value!");
+            error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0,
+                                        typeSize * deviceThreadCount
+                                            * NumNonAtomicVariablesPerThread(),
+                                        &refValues[0], 0, NULL, NULL);
+            test_error(error, "Unable to read reference values!");
+        }
+    }
+    bool dataVerified = false;
+    // If we have an expectedFn, then we need to generate a final value to
+    // compare against. If we don't have one, it's because we're comparing ref
+    // values only
+    for (cl_uint i = 0; i < numDestItems; i++)
+    {
+        HostDataType expected;
+
+        if (!ExpectedValue(expected, threadCount,
+                           startRefValues.size() ? &startRefValues[0] : 0, i))
+            break; // no expected value function provided
+
+        if (expected != destItems[i])
+        {
+            std::stringstream logLine;
+            logLine << "ERROR: Result " << i
+                    << " from kernel does not validate! (should be " << expected
+                    << ", was " << destItems[i] << ")\n";
+            log_error("%s", logLine.str().c_str());
+            for (i = 0; i < threadCount; i++)
+            {
+                logLine.str("");
+                logLine << " --- " << i << " - ";
+                if (startRefValues.size())
+                    logLine << startRefValues[i] << " -> " << refValues[i];
+                else
+                    logLine << refValues[i];
+                logLine << " --- ";
+                if (i < numDestItems) logLine << destItems[i];
+                logLine << "\n";
+                log_info("%s", logLine.str().c_str());
+            }
+            if (!gDebug)
+            {
+                log_info("Program source:\n");
+                log_info("%s\n", programLine);
+            }
+            return -1;
+        }
+        dataVerified = true;
+    }
+
+    bool dataCorrect = false;
+    /* Use the verify function (if provided) to also check the results */
+    if (VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0]))
+    {
+        if (!dataCorrect)
+        {
+            log_error("ERROR: Reference values did not validate!\n");
+            std::stringstream logLine;
+            for (cl_uint i = 0; i < threadCount; i++)
+                for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++)
+                {
+                    logLine.str("");
+                    logLine
+                        << " --- " << i << " - "
+                        << refValues[i * NumNonAtomicVariablesPerThread() + j]
+                        << " --- ";
+                    if (j == 0 && i < numDestItems) logLine << destItems[i];
+                    logLine << "\n";
+                    log_info("%s", logLine.str().c_str());
+                }
+            if (!gDebug)
+            {
+                log_info("Program source:\n");
+                log_info("%s\n", programLine);
+            }
+            return -1;
+        }
+    }
+    else if (!dataVerified)
+    {
+        log_error("ERROR: Test doesn't check total or refs; no values are "
+                  "verified!\n");
+        return -1;
+    }
+
+    if (OldValueCheck()
+        && !(DeclaredInProgram()
+             && !LocalMemory())) // don't test for programs scope global atomics
+                                 // 'old' value has been overwritten by previous
+                                 // clEnqueueNDRangeKernel
+    {
+        /* Re-write the starting value */
+        for (size_t i = 0; i < numDestItems; i++) destItems[i] = _startValue;
+        refValues[0] = 0;
+        if (deviceThreadCount > 0)
+        {
+            error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0,
+                                         typeSize * numDestItems, &destItems[0],
+                                         0, NULL, NULL);
+            test_error(error, "Unable to write starting values!");
+
+            /* Run the kernel once for a single thread, so we can verify that
+             * the returned value is the original one */
+            threadNum[0] = 1;
+            error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum,
+                                           threadNum, 0, NULL, NULL);
+            test_error(error, "Unable to execute test kernel");
+
+            error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize,
+                                        &refValues[0], 0, NULL, NULL);
+            test_error(error, "Unable to read reference values!");
+        }
+        else
+        {
+            /* Start host thread */
+            HostFunction(0, 1, &destItems[0], &refValues[0]);
+        }
+
+        if (refValues[0] != _startValue) // destItems[0])
+        {
+            std::stringstream logLine;
+            logLine << "ERROR: atomic function operated correctly but did NOT "
+                       "return correct 'old' value "
+                       " (should have been "
+                    << destItems[0] << ", returned " << refValues[0] << ")!\n";
+            log_error("%s", logLine.str().c_str());
+            if (!gDebug)
+            {
+                log_info("Program source:\n");
+                log_info("%s\n", programLine);
+            }
+            return -1;
+        }
+    }
+    if (UseSVM())
+    {
+        // the buffer object must first be released before the SVM buffer is
+        // freed. The Wrapper Class method reset() will do that
+        streams[0].reset();
+        if (gUseHostPtr)
+            free(svmAtomicBuffer);
+        else
+            clSVMFree(context, svmAtomicBuffer);
+        streams[1].reset();
+        if (gUseHostPtr)
+            free(svmDataBuffer);
+        else
+            clSVMFree(context, svmDataBuffer);
+    }
+    _passCount++;
+    return 0;
 }
 
 #endif //_COMMON_H_

From 43e1397468053608134816cbcf6e8496e91cb227 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=A9vin=20Petit?= <kpet@free.fr>
Date: Wed, 18 Aug 2021 11:11:30 +0100
Subject: [PATCH 118/158] Fix kernel source for
 cl_khr_suggested_local_work_size (#1300)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use ASCII '-' instead of Unicode '–' as the subtraction operator.

Signed-off-by: Kévin Petit <kpet@free.fr>
---
 .../workgroups/test_wg_suggested_local_work_size.cpp      | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
index 1dc1b39c9f..aa02391c5d 100644
--- a/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
+++ b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp
@@ -42,9 +42,9 @@ const char* wg_scan_local_work_group_size = R"(
     {
         size_t linear_id;
 #if __OPENCL_VERSION__ < CL_VERSION_2_0
-        linear_id = ((get_global_id(2) – get_global_offset(2)) * get_global_size(1) * get_global_size(0)) + 
-                    ((get_global_id(1) – get_global_offset(1)) * get_global_size(0)) + 
-                    (get_global_id(0) – get_global_offset(0));
+        linear_id = ((get_global_id(2) - get_global_offset(2)) * get_global_size(1) * get_global_size(0)) +
+                    ((get_global_id(1) - get_global_offset(1)) * get_global_size(0)) +
+                    (get_global_id(0) - get_global_offset(0));
 #else
         linear_id = get_global_linear_id();
 #endif
@@ -608,4 +608,4 @@ int test_work_group_suggested_local_size_3D(cl_device_id device,
              "global_work_offset passed\n");
 
     return err;
-}
\ No newline at end of file
+}

From 6c3c7e5266cddce9cfa466c02c14b43fee453110 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=A9vin=20Petit?= <kpet@free.fr>
Date: Thu, 19 Aug 2021 12:15:47 +0100
Subject: [PATCH 119/158] Remove unused definitions in CMakeLists.txt (#1302)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Kévin Petit <kpet@free.fr>
---
 CMakeLists.txt | 47 -----------------------------------------------
 1 file changed, 47 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8d947ed1c8..a614649fd2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,12 +10,6 @@ set(CMAKE_C_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
-if(CMAKE_BUILD_TYPE STREQUAL "release")
-    set (BUILD_FLAVOR "release")
-else(CMAKE_BUILD_TYPE STREQUAL "release")
-    set (BUILD_FLAVOR "debug")
-endif(CMAKE_BUILD_TYPE STREQUAL "release")
-
 add_definitions(-DCL_TARGET_OPENCL_VERSION=300)
 add_definitions(-DCL_USE_DEPRECATED_OPENCL_2_2_APIS=1)
 add_definitions(-DCL_USE_DEPRECATED_OPENCL_2_1_APIS=1)
@@ -29,14 +23,6 @@ if(USE_CL_EXPERIMENTAL)
   add_definitions(-DCL_EXPERIMENTAL)
 endif(USE_CL_EXPERIMENTAL)
 
-# Support both VS2008 and VS2012.
-set(BUILD_DIR "$ENV{ADRENO_DRIVER}/build")
-if(MSVC90)
-  set(VS_BUILD_DIR "${BUILD_DIR}/vs2008")
-else(MSVC110)
-  set(VS_BUILD_DIR "${BUILD_DIR}/vs2012")
-endif(MSVC90)
-
 #-----------------------------------------------------------
 # Default Configurable Test Set
 #-----------------------------------------------------------
@@ -164,38 +150,5 @@ include_directories(${CLConform_SOURCE_DIR}/test_common/harness
                     ${CLConform_SOURCE_DIR}/test_common/gl
                     ${CLConform_SOURCE_DIR}/test_common)
 
-if(CMAKE_BUILD_TYPE STREQUAL "release")
-    set (BUILD_FLAVOR "release")
-elseif (CMAKE_BUILD_TYPE STREQUAL "debug")
-    set (BUILD_FLAVOR "debug")
-endif(CMAKE_BUILD_TYPE STREQUAL "release")
-
-
 add_subdirectory(test_common)
 add_subdirectory(test_conformance)
-
-# Support both VS2008 and VS2012.
-set (DLL_FILES "${VS_BUILD_DIR}/Debug/*.dll")
-set (DST_DIR   "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/Debug/")
-
-if (WIN32)
-    set (COPY "echo")
-    add_custom_target(COPY_DLL${CONFORMANCE_SUFFIX} ALL
-                      COMMAND ${COPY} "${DLL_FILES}" "${DST_DIR}"
-                      COMMENT "Copying dll files.. ")
-else (WIN32)
-    set (COPY cp)
-    add_custom_target(COPY_DLL${CONFORMANCE_SUFFIX})
-endif(WIN32)
-
-set_property(TARGET COPY_DLL${CONFORMANCE_SUFFIX} PROPERTY FOLDER "CONFORMANCE${CONFORMANCE_SUFFIX}")
-
-if(WIN32)
-  add_custom_target( COPY_FILES${CONFORMANCE_SUFFIX} ALL
-                     COMMAND ${COPY} ${DLL_FILES} ${DST_DIR}
-                     COMMENT "Copying other files to output folder..." )
-else(WIN32)
-  add_custom_target( COPY_FILES${CONFORMANCE_SUFFIX} )
-endif(WIN32)
-
-set_property(TARGET COPY_FILES${CONFORMANCE_SUFFIX} PROPERTY FOLDER "CONFORMANCE${CONFORMANCE_SUFFIX}")

From 070f8c0c0ed8786e410584efa3fefa47bdab02c6 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Wed, 25 Aug 2021 02:14:58 -0700
Subject: [PATCH 120/158] add tests for cl_khr_integer_dot_product (#1276)

* cl_khr_integer_dot_product_tests

* remove emulated codepaths

* fix formatting

* address code review comments

* remove emulated codepaths again

* address one more review comment
---
 test_common/harness/integer_ops_test_info.h   |  91 +++++
 test_conformance/integer_ops/CMakeLists.txt   |   1 +
 test_conformance/integer_ops/main.cpp         | 236 +++++------
 test_conformance/integer_ops/procs.h          |   2 +
 .../integer_ops/test_integer_dot_product.cpp  | 380 ++++++++++++++++++
 5 files changed, 593 insertions(+), 117 deletions(-)
 create mode 100644 test_common/harness/integer_ops_test_info.h
 create mode 100644 test_conformance/integer_ops/test_integer_dot_product.cpp

diff --git a/test_common/harness/integer_ops_test_info.h b/test_common/harness/integer_ops_test_info.h
new file mode 100644
index 0000000000..c25843ddc0
--- /dev/null
+++ b/test_common/harness/integer_ops_test_info.h
@@ -0,0 +1,91 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#ifndef INTEGER_OPS_TEST_INFO_H
+#define INTEGER_OPS_TEST_INFO_H
+
+#include "conversions.h"
+
+// Traits for host cl_* integer types: harness ExplicitType plus device-side
+// type names. Primary template is empty. TODO: expand usage to other tests.
+template <typename T> struct TestInfo
+{
+};
+template <> struct TestInfo<cl_char>
+{
+    static const ExplicitType explicitType = kChar;
+    static constexpr const char* deviceTypeName = "char";
+    static constexpr const char* deviceTypeNameSigned = "char";
+    static constexpr const char* deviceTypeNameUnsigned = "uchar";
+};
+template <> struct TestInfo<cl_uchar>
+{
+    static const ExplicitType explicitType = kUChar;
+    static constexpr const char* deviceTypeName = "uchar";
+    static constexpr const char* deviceTypeNameSigned = "char";
+    static constexpr const char* deviceTypeNameUnsigned = "uchar";
+};
+template <> struct TestInfo<cl_short>
+{
+    static const ExplicitType explicitType = kShort;
+    static constexpr const char* deviceTypeName = "short";
+    static constexpr const char* deviceTypeNameSigned = "short";
+    static constexpr const char* deviceTypeNameUnsigned = "ushort";
+};
+template <> struct TestInfo<cl_ushort>
+{
+    static const ExplicitType explicitType = kUShort;
+    static constexpr const char* deviceTypeName = "ushort";
+    static constexpr const char* deviceTypeNameSigned = "short";
+    static constexpr const char* deviceTypeNameUnsigned = "ushort";
+};
+template <> struct TestInfo<cl_int>
+{
+    static const ExplicitType explicitType = kInt;
+    static constexpr const char* deviceTypeName = "int";
+    static constexpr const char* deviceTypeNameSigned = "int";
+    static constexpr const char* deviceTypeNameUnsigned = "uint";
+};
+template <> struct TestInfo<cl_uint>
+{
+    static const ExplicitType explicitType = kUInt;
+    static constexpr const char* deviceTypeName = "uint";
+    static constexpr const char* deviceTypeNameSigned = "int";
+    static constexpr const char* deviceTypeNameUnsigned = "uint";
+};
+template <> struct TestInfo<cl_long>
+{
+    static const ExplicitType explicitType = kLong;
+    static constexpr const char* deviceTypeName = "long";
+    static constexpr const char* deviceTypeNameSigned = "long";
+    static constexpr const char* deviceTypeNameUnsigned = "ulong";
+};
+template <> struct TestInfo<cl_ulong>
+{
+    static const ExplicitType explicitType = kULong;
+    static constexpr const char* deviceTypeName = "ulong";
+    static constexpr const char* deviceTypeNameSigned = "long";
+    static constexpr const char* deviceTypeNameUnsigned = "ulong";
+};
+
+template <typename T> // requires TestInfo<T>::explicitType to exist
+static void fill_vector_with_random_data(std::vector<T>& v)
+{
+    MTdataHolder d(gRandomSeed); // generator state built from gRandomSeed
+    generate_random_data(TestInfo<T>::explicitType, v.size(), d, v.data());
+}
+
+#endif /* INTEGER_OPS_TEST_INFO_H */
diff --git a/test_conformance/integer_ops/CMakeLists.txt b/test_conformance/integer_ops/CMakeLists.txt
index a045ef81c1..5344eabc07 100644
--- a/test_conformance/integer_ops/CMakeLists.txt
+++ b/test_conformance/integer_ops/CMakeLists.txt
@@ -11,6 +11,7 @@ set(${MODULE_NAME}_SOURCES
     test_unary_ops.cpp
     verification_and_generation_functions.cpp
     test_popcount.cpp
+    test_integer_dot_product.cpp
 )
 
 include(../CMakeCommon.txt)
diff --git a/test_conformance/integer_ops/main.cpp b/test_conformance/integer_ops/main.cpp
index 00e91661c3..e57cffd949 100644
--- a/test_conformance/integer_ops/main.cpp
+++ b/test_conformance/integer_ops/main.cpp
@@ -25,127 +25,129 @@
 #endif
 
 test_definition test_list[] = {
-    ADD_TEST( integer_clz ),
-    ADD_TEST_VERSION( integer_ctz,  Version(2, 0)),
-    ADD_TEST( integer_hadd ),
-    ADD_TEST( integer_rhadd ),
-    ADD_TEST( integer_mul_hi ),
-    ADD_TEST( integer_rotate ),
-    ADD_TEST( integer_clamp ),
-    ADD_TEST( integer_mad_sat ),
-    ADD_TEST( integer_mad_hi ),
-    ADD_TEST( integer_min ),
-    ADD_TEST( integer_max ),
-    ADD_TEST( integer_upsample ),
-
-    ADD_TEST( integer_abs ),
-    ADD_TEST( integer_abs_diff ),
-    ADD_TEST( integer_add_sat ),
-    ADD_TEST( integer_sub_sat ),
-
-    ADD_TEST( integer_addAssign ),
-    ADD_TEST( integer_subtractAssign ),
-    ADD_TEST( integer_multiplyAssign ),
-    ADD_TEST( integer_divideAssign ),
-    ADD_TEST( integer_moduloAssign ),
-    ADD_TEST( integer_andAssign ),
-    ADD_TEST( integer_orAssign ),
-    ADD_TEST( integer_exclusiveOrAssign ),
-
-    ADD_TEST( unary_ops_increment ),
-    ADD_TEST( unary_ops_decrement ),
-    ADD_TEST( unary_ops_full ),
-
-    ADD_TEST( integer_mul24 ),
-    ADD_TEST( integer_mad24 ),
-
-    ADD_TEST( long_math ),
-    ADD_TEST( long_logic ),
-    ADD_TEST( long_shift ),
-    ADD_TEST( long_compare ),
-
-    ADD_TEST( ulong_math ),
-    ADD_TEST( ulong_logic ),
-    ADD_TEST( ulong_shift ),
-    ADD_TEST( ulong_compare ),
-
-    ADD_TEST( int_math ),
-    ADD_TEST( int_logic ),
-    ADD_TEST( int_shift ),
-    ADD_TEST( int_compare ),
-
-    ADD_TEST( uint_math ),
-    ADD_TEST( uint_logic ),
-    ADD_TEST( uint_shift ),
-    ADD_TEST( uint_compare ),
-
-    ADD_TEST( short_math ),
-    ADD_TEST( short_logic ),
-    ADD_TEST( short_shift ),
-    ADD_TEST( short_compare ),
-
-    ADD_TEST( ushort_math ),
-    ADD_TEST( ushort_logic ),
-    ADD_TEST( ushort_shift ),
-    ADD_TEST( ushort_compare ),
-
-    ADD_TEST( char_math ),
-    ADD_TEST( char_logic ),
-    ADD_TEST( char_shift ),
-    ADD_TEST( char_compare ),
-
-    ADD_TEST( uchar_math ),
-    ADD_TEST( uchar_logic ),
-    ADD_TEST( uchar_shift ),
-    ADD_TEST( uchar_compare ),
-
-    ADD_TEST( popcount ),
+    ADD_TEST(integer_clz),
+    ADD_TEST_VERSION(integer_ctz, Version(2, 0)),
+    ADD_TEST(integer_hadd),
+    ADD_TEST(integer_rhadd),
+    ADD_TEST(integer_mul_hi),
+    ADD_TEST(integer_rotate),
+    ADD_TEST(integer_clamp),
+    ADD_TEST(integer_mad_sat),
+    ADD_TEST(integer_mad_hi),
+    ADD_TEST(integer_min),
+    ADD_TEST(integer_max),
+    ADD_TEST(integer_upsample),
+
+    ADD_TEST(integer_abs),
+    ADD_TEST(integer_abs_diff),
+    ADD_TEST(integer_add_sat),
+    ADD_TEST(integer_sub_sat),
+
+    ADD_TEST(integer_addAssign),
+    ADD_TEST(integer_subtractAssign),
+    ADD_TEST(integer_multiplyAssign),
+    ADD_TEST(integer_divideAssign),
+    ADD_TEST(integer_moduloAssign),
+    ADD_TEST(integer_andAssign),
+    ADD_TEST(integer_orAssign),
+    ADD_TEST(integer_exclusiveOrAssign),
+
+    ADD_TEST(unary_ops_increment),
+    ADD_TEST(unary_ops_decrement),
+    ADD_TEST(unary_ops_full),
+
+    ADD_TEST(integer_mul24),
+    ADD_TEST(integer_mad24),
+
+    ADD_TEST(long_math),
+    ADD_TEST(long_logic),
+    ADD_TEST(long_shift),
+    ADD_TEST(long_compare),
+
+    ADD_TEST(ulong_math),
+    ADD_TEST(ulong_logic),
+    ADD_TEST(ulong_shift),
+    ADD_TEST(ulong_compare),
+
+    ADD_TEST(int_math),
+    ADD_TEST(int_logic),
+    ADD_TEST(int_shift),
+    ADD_TEST(int_compare),
+
+    ADD_TEST(uint_math),
+    ADD_TEST(uint_logic),
+    ADD_TEST(uint_shift),
+    ADD_TEST(uint_compare),
+
+    ADD_TEST(short_math),
+    ADD_TEST(short_logic),
+    ADD_TEST(short_shift),
+    ADD_TEST(short_compare),
+
+    ADD_TEST(ushort_math),
+    ADD_TEST(ushort_logic),
+    ADD_TEST(ushort_shift),
+    ADD_TEST(ushort_compare),
+
+    ADD_TEST(char_math),
+    ADD_TEST(char_logic),
+    ADD_TEST(char_shift),
+    ADD_TEST(char_compare),
+
+    ADD_TEST(uchar_math),
+    ADD_TEST(uchar_logic),
+    ADD_TEST(uchar_shift),
+    ADD_TEST(uchar_compare),
+
+    ADD_TEST(popcount),
 
     // Quick
-    ADD_TEST( quick_long_math ),
-    ADD_TEST( quick_long_logic ),
-    ADD_TEST( quick_long_shift ),
-    ADD_TEST( quick_long_compare ),
-
-    ADD_TEST( quick_ulong_math ),
-    ADD_TEST( quick_ulong_logic ),
-    ADD_TEST( quick_ulong_shift ),
-    ADD_TEST( quick_ulong_compare ),
-
-    ADD_TEST( quick_int_math ),
-    ADD_TEST( quick_int_logic ),
-    ADD_TEST( quick_int_shift ),
-    ADD_TEST( quick_int_compare ),
-
-    ADD_TEST( quick_uint_math ),
-    ADD_TEST( quick_uint_logic ),
-    ADD_TEST( quick_uint_shift ),
-    ADD_TEST( quick_uint_compare ),
-
-    ADD_TEST( quick_short_math ),
-    ADD_TEST( quick_short_logic ),
-    ADD_TEST( quick_short_shift ),
-    ADD_TEST( quick_short_compare ),
-
-    ADD_TEST( quick_ushort_math ),
-    ADD_TEST( quick_ushort_logic ),
-    ADD_TEST( quick_ushort_shift ),
-    ADD_TEST( quick_ushort_compare ),
-
-    ADD_TEST( quick_char_math ),
-    ADD_TEST( quick_char_logic ),
-    ADD_TEST( quick_char_shift ),
-    ADD_TEST( quick_char_compare ),
-
-    ADD_TEST( quick_uchar_math ),
-    ADD_TEST( quick_uchar_logic ),
-    ADD_TEST( quick_uchar_shift ),
-    ADD_TEST( quick_uchar_compare ),
-
-    ADD_TEST( vector_scalar ),
+    ADD_TEST(quick_long_math),
+    ADD_TEST(quick_long_logic),
+    ADD_TEST(quick_long_shift),
+    ADD_TEST(quick_long_compare),
+
+    ADD_TEST(quick_ulong_math),
+    ADD_TEST(quick_ulong_logic),
+    ADD_TEST(quick_ulong_shift),
+    ADD_TEST(quick_ulong_compare),
+
+    ADD_TEST(quick_int_math),
+    ADD_TEST(quick_int_logic),
+    ADD_TEST(quick_int_shift),
+    ADD_TEST(quick_int_compare),
+
+    ADD_TEST(quick_uint_math),
+    ADD_TEST(quick_uint_logic),
+    ADD_TEST(quick_uint_shift),
+    ADD_TEST(quick_uint_compare),
+
+    ADD_TEST(quick_short_math),
+    ADD_TEST(quick_short_logic),
+    ADD_TEST(quick_short_shift),
+    ADD_TEST(quick_short_compare),
+
+    ADD_TEST(quick_ushort_math),
+    ADD_TEST(quick_ushort_logic),
+    ADD_TEST(quick_ushort_shift),
+    ADD_TEST(quick_ushort_compare),
+
+    ADD_TEST(quick_char_math),
+    ADD_TEST(quick_char_logic),
+    ADD_TEST(quick_char_shift),
+    ADD_TEST(quick_char_compare),
+
+    ADD_TEST(quick_uchar_math),
+    ADD_TEST(quick_uchar_logic),
+    ADD_TEST(quick_uchar_shift),
+    ADD_TEST(quick_uchar_compare),
+
+    ADD_TEST(vector_scalar),
+
+    ADD_TEST(integer_dot_product),
 };
 
-const int test_num = ARRAY_SIZE( test_list );
+const int test_num = ARRAY_SIZE(test_list);
 
 void fill_test_values( cl_long *outBufferA, cl_long *outBufferB, size_t numElements, MTdata d )
 {
diff --git a/test_conformance/integer_ops/procs.h b/test_conformance/integer_ops/procs.h
index d5b77e704b..82311fb9cc 100644
--- a/test_conformance/integer_ops/procs.h
+++ b/test_conformance/integer_ops/procs.h
@@ -141,3 +141,5 @@ extern int test_unary_ops_decrement(cl_device_id deviceID, cl_context context, c
 
 extern int test_vector_scalar(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 
+extern int test_integer_dot_product(cl_device_id deviceID, cl_context context,
+                                    cl_command_queue queue, int num_elements);
diff --git a/test_conformance/integer_ops/test_integer_dot_product.cpp b/test_conformance/integer_ops/test_integer_dot_product.cpp
new file mode 100644
index 0000000000..b5378ae093
--- /dev/null
+++ b/test_conformance/integer_ops/test_integer_dot_product.cpp
@@ -0,0 +1,380 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+// This is needed for std::numeric_limits<>::min() and max() to work on Windows.
+#if defined(_WIN32)
+#define NOMINMAX
+#endif
+
+#include <algorithm>
+#include <limits>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "procs.h"
+#include "harness/integer_ops_test_info.h"
+#include "harness/testHarness.h"
+
+// Host reference for dot / dot_acc_sat: ref[r] is the sum of the N products
+// a[r*N+c] * b[r*N+c], plus acc[r] clamped to DstType's range when AccSat.
+template <size_t N, typename DstType, typename SrcTypeA, typename SrcTypeB>
+static void
+calculate_reference(std::vector<DstType>& ref, const std::vector<SrcTypeA>& a,
+                    const std::vector<SrcTypeB>& b, const bool AccSat = false,
+                    const std::vector<DstType>& acc = {})
+{
+    assert(a.size() == b.size());
+    assert(AccSat == false || acc.size() == a.size() / N);
+
+    const cl_long lo = std::numeric_limits<DstType>::min();
+    const cl_long hi = std::numeric_limits<DstType>::max();
+    ref.resize(a.size() / N);
+    for (size_t r = 0; r < ref.size(); r++)
+    {
+        // Widen operands first so 8/16/32-bit products cannot overflow.
+        cl_long result = AccSat ? acc[r] : 0;
+        for (size_t c = 0; c < N; c++)
+            result += cl_long(a[r * N + c]) * cl_long(b[r * N + c]);
+        // Clamp both ways: a signed accumulation can underflow as well.
+        if (AccSat) result = std::min(std::max(result, lo), hi);
+        ref[r] = static_cast<DstType>(result);
+    }
+}
+
+// Fills a and b with inputs: first all pairings of seven boundary values per
+// source type (as many as fit in both vectors), then random data for the rest.
+template <typename SrcTypeA, typename SrcTypeB>
+void generate_inputs_with_special_values(std::vector<SrcTypeA>& a,
+                                         std::vector<SrcTypeB>& b)
+{
+    const std::vector<SrcTypeA> specialValuesA(
+        { static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::min()),
+          static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::min() + 1),
+          static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::min() / 2), 0,
+          static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::max() / 2),
+          static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::max() - 1),
+          static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::max()) });
+    const std::vector<SrcTypeB> specialValuesB(
+        { static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::min()),
+          static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::min() + 1),
+          static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::min() / 2), 0,
+          static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::max() / 2),
+          static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::max() - 1),
+          static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::max()) });
+
+    // A-major pairing, clamped so that short vectors are never overrun.
+    const size_t numA = specialValuesA.size(), numB = specialValuesB.size();
+    const size_t count = std::min({ a.size(), b.size(), numA * numB });
+    for (size_t i = 0; i < count; i++)
+    {
+        a[i] = specialValuesA[i / numB];
+        b[i] = specialValuesB[i % numB];
+    }
+
+    // Generate random data for the rest of the inputs:
+    MTdataHolder d(gRandomSeed);
+    generate_random_data(TestInfo<SrcTypeA>::explicitType, a.size() - count, d,
+                         a.data() + count);
+    generate_random_data(TestInfo<SrcTypeB>::explicitType, b.size() - count, d,
+                         b.data() + count);
+}
+
+// Fills acc with random accumulators, then replaces every other element with
+// max() - value; the intent is to obtain inputs that need saturation.
+template <typename SrcType>
+void generate_acc_sat_inputs(std::vector<SrcType>& acc)
+{
+    typedef typename std::make_unsigned<SrcType>::type Bits;
+    fill_vector_with_random_data(acc);
+    // Subtract as unsigned: for signed SrcType and negative acc[i], computing
+    // max() - acc[i] directly is signed overflow, i.e. undefined behaviour.
+    const Bits top = static_cast<Bits>(std::numeric_limits<SrcType>::max());
+    for (size_t i = 0; i < acc.size(); i += 2)
+        acc[i] = static_cast<SrcType>(top - static_cast<Bits>(acc[i]));
+}
+
+template <typename T> struct PackedTestInfo
+{ // Fallback for types without a packed form; the name is a placeholder.
+    static constexpr const char* deviceTypeName = "UNSUPPORTED";
+};
+template <> struct PackedTestInfo<cl_char>
+{ // Packed signed 8-bit inputs are passed to the kernel as "int".
+    static constexpr const char* deviceTypeName = "int";
+};
+template <> struct PackedTestInfo<cl_uchar>
+{ // Packed unsigned 8-bit inputs are passed to the kernel as "uint".
+    static constexpr const char* deviceTypeName = "uint";
+};
+
+static constexpr const char* kernel_source_dot = R"CLC(
+__kernel void test_dot(__global DSTTYPE* dst, __global SRCTYPEA* a, __global SRCTYPEB* b)
+{
+    int index = get_global_id(0);
+    dst[index] = DOT(a[index], b[index]);
+}
+)CLC";
+
+static constexpr const char* kernel_source_dot_acc_sat = R"CLC(
+__kernel void test_dot_acc_sat(
+    __global DSTTYPE* dst,
+    __global SRCTYPEA* a, __global SRCTYPEB* b, __global DSTTYPE* acc)
+{
+    int index = get_global_id(0);
+    dst[index] = DOT_ACC_SAT(a[index], b[index], acc[index]);
+}
+)CLC";
+
+template <typename DstType, typename SrcTypeA, typename SrcTypeB, size_t N>
+static int test_case_dot(cl_device_id deviceID, cl_context context,
+                         cl_command_queue queue, int num_elements, bool packed,
+                         bool sat)
+{ // Runs one dot/dot_acc_sat variant and compares it with a host reference.
+    log_info("    testing %s = dot%s%s(%s, %s)\n",
+             std::numeric_limits<DstType>::is_signed ? "signed" : "unsigned",
+             sat ? "_acc_sat" : "", packed ? "_packed" : "",
+             std::numeric_limits<SrcTypeA>::is_signed ? "signed" : "unsigned",
+             std::numeric_limits<SrcTypeB>::is_signed ? "signed" : "unsigned");
+    // packed: use the dot_4x8packed_* built-ins; sat: use the *_acc_sat forms.
+    cl_int error = CL_SUCCESS;
+
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    // The kernel is specialised purely through -D macros in the build options.
+    std::string buildOptions;
+    buildOptions += " -DDSTTYPE=";
+    buildOptions += TestInfo<DstType>::deviceTypeName;
+    buildOptions += " -DSRCTYPEA=";
+    buildOptions += packed
+        ? PackedTestInfo<SrcTypeA>::deviceTypeName
+        : TestInfo<SrcTypeA>::deviceTypeName + std::to_string(N);
+    buildOptions += " -DSRCTYPEB=";
+    buildOptions += packed
+        ? PackedTestInfo<SrcTypeB>::deviceTypeName
+        : TestInfo<SrcTypeB>::deviceTypeName + std::to_string(N);
+    std::string packedSuffix; // e.g. "us_int" = unsigned A, signed B
+    packedSuffix += std::numeric_limits<SrcTypeA>::is_signed ? "s" : "u";
+    packedSuffix += std::numeric_limits<SrcTypeB>::is_signed ? "s" : "u";
+    packedSuffix += std::numeric_limits<DstType>::is_signed ? "_int" : "_uint";
+    if (sat)
+    {
+        buildOptions += packed
+            ? " -DDOT_ACC_SAT=dot_acc_sat_4x8packed_" + packedSuffix
+            : " -DDOT_ACC_SAT=dot_acc_sat";
+    }
+    else
+    {
+        buildOptions +=
+            packed ? " -DDOT=dot_4x8packed_" + packedSuffix : " -DDOT=dot";
+    }
+    // Host inputs: N source elements per work-item for each of a and b.
+    std::vector<SrcTypeA> a(N * num_elements);
+    std::vector<SrcTypeB> b(N * num_elements);
+    generate_inputs_with_special_values(a, b);
+    // The accumulator input only exists for the dot_acc_sat variants.
+    std::vector<DstType> acc;
+    if (sat)
+    {
+        acc.resize(num_elements);
+        generate_acc_sat_inputs(acc);
+    }
+    // Expected results, computed on the host.
+    std::vector<DstType> reference(num_elements);
+    calculate_reference<N>(reference, a, b, sat, acc);
+
+    const char* source = sat ? kernel_source_dot_acc_sat : kernel_source_dot;
+    const char* name = sat ? "test_dot_acc_sat" : "test_dot";
+    error = create_single_kernel_helper(context, &program, &kernel, 1, &source,
+                                        name, buildOptions.c_str());
+    test_error(error, "Unable to create test kernel");
+
+    clMemWrapper dst = clCreateBuffer(
+        context, 0, reference.size() * sizeof(DstType), NULL, &error);
+    test_error(error, "Unable to create output buffer");
+
+    clMemWrapper srcA =
+        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                       a.size() * sizeof(SrcTypeA), a.data(), &error);
+    test_error(error, "Unable to create srcA buffer");
+
+    clMemWrapper srcB =
+        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                       b.size() * sizeof(SrcTypeB), b.data(), &error);
+    test_error(error, "Unable to create srcB buffer");
+
+    clMemWrapper srcAcc;
+    if (sat)
+    {
+        srcAcc =
+            clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                           acc.size() * sizeof(DstType), acc.data(), &error);
+        test_error(error, "Unable to create acc buffer");
+    }
+    // Argument order matches the kernel signature: dst, a, b[, acc].
+    error = clSetKernelArg(kernel, 0, sizeof(dst), &dst);
+    test_error(error, "Unable to set output buffer kernel arg");
+
+    error = clSetKernelArg(kernel, 1, sizeof(srcA), &srcA);
+    test_error(error, "Unable to set srcA buffer kernel arg");
+
+    error = clSetKernelArg(kernel, 2, sizeof(srcB), &srcB);
+    test_error(error, "Unable to set srcB buffer kernel arg");
+
+    if (sat)
+    {
+        error = clSetKernelArg(kernel, 3, sizeof(srcAcc), &srcAcc);
+        test_error(error, "Unable to set acc buffer kernel arg");
+    }
+    // One work-item per output element.
+    size_t global_work_size[] = { reference.size() };
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size,
+                                   NULL, 0, NULL, NULL);
+    test_error(error, "Unable to enqueue test kernel");
+
+    error = clFinish(queue);
+    test_error(error, "clFinish failed after test kernel");
+    // Pre-fill with 99, presumably so unwritten results are recognisable.
+    std::vector<DstType> results(reference.size(), 99);
+    error = clEnqueueReadBuffer(queue, dst, CL_TRUE, 0,
+                                results.size() * sizeof(DstType),
+                                results.data(), 0, NULL, NULL);
+    test_error(error, "Unable to read data after test kernel");
+    // Any difference from the reference fails the test case.
+    if (results != reference)
+    {
+        log_error("Result buffer did not match reference buffer!\n");
+        return TEST_FAIL;
+    }
+
+    return TEST_PASS;
+}
+
+// Exercises the non-packed built-ins on N-element vectors of SrcType.  Each of
+// dot and dot_acc_sat is run for the four operand signedness pairings
+// (u/u, s/s, u/s, s/u); the individual outcomes are OR-ed together.
+template <typename SrcType, typename DstType, size_t N>
+static int test_vectype(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements)
+{
+    using SignedSrc = typename std::make_signed<SrcType>::type;
+    using SignedDst = typename std::make_signed<DstType>::type;
+    using UnsignedSrc = typename std::make_unsigned<SrcType>::type;
+    using UnsignedDst = typename std::make_unsigned<DstType>::type;
+
+    const bool packed = false;
+    int status = TEST_PASS;
+
+    // Pass 1 (sat == false) covers dot, pass 2 (sat == true) dot_acc_sat.
+    for (const bool sat : { false, true })
+    {
+        // unsigned = dot(unsigned, unsigned)
+        status |= test_case_dot<UnsignedDst, UnsignedSrc, UnsignedSrc, N>(
+            deviceID, context, queue, num_elements, packed, sat);
+        // signed = dot(signed, signed)
+        status |= test_case_dot<SignedDst, SignedSrc, SignedSrc, N>(
+            deviceID, context, queue, num_elements, packed, sat);
+        // signed = dot(unsigned, signed)
+        status |= test_case_dot<SignedDst, UnsignedSrc, SignedSrc, N>(
+            deviceID, context, queue, num_elements, packed, sat);
+        // signed = dot(signed, unsigned)
+        status |= test_case_dot<SignedDst, SignedSrc, UnsignedSrc, N>(
+            deviceID, context, queue, num_elements, packed, sat);
+    }
+
+    return status;
+}
+
+// Exercises the packed built-ins, where the N SrcType inputs travel in one
+// packed kernel argument.  Each of dot and dot_acc_sat is run for the four
+// operand signedness pairings (u/u, s/s, u/s, s/u); outcomes are OR-ed.
+template <typename SrcType, typename DstType, size_t N>
+static int test_vectype_packed(cl_device_id deviceID, cl_context context,
+                               cl_command_queue queue, int num_elements)
+{
+    using SignedSrc = typename std::make_signed<SrcType>::type;
+    using SignedDst = typename std::make_signed<DstType>::type;
+    using UnsignedSrc = typename std::make_unsigned<SrcType>::type;
+    using UnsignedDst = typename std::make_unsigned<DstType>::type;
+
+    const bool packed = true;
+    int status = TEST_PASS;
+
+    // Pass 1 (sat == false) covers dot, pass 2 (sat == true) dot_acc_sat.
+    for (const bool sat : { false, true })
+    {
+        // unsigned = dot(unsigned, unsigned)
+        status |= test_case_dot<UnsignedDst, UnsignedSrc, UnsignedSrc, N>(
+            deviceID, context, queue, num_elements, packed, sat);
+        // signed = dot(signed, signed)
+        status |= test_case_dot<SignedDst, SignedSrc, SignedSrc, N>(
+            deviceID, context, queue, num_elements, packed, sat);
+        // signed = dot(unsigned, signed)
+        status |= test_case_dot<SignedDst, UnsignedSrc, SignedSrc, N>(
+            deviceID, context, queue, num_elements, packed, sat);
+        // signed = dot(signed, unsigned)
+        status |= test_case_dot<SignedDst, SignedSrc, UnsignedSrc, N>(
+            deviceID, context, queue, num_elements, packed, sat);
+    }
+
+    return status;
+}
+
+// Test entry point for cl_khr_integer_dot_product: skips when the extension is
+// absent, validates the reported capabilities, then runs each enabled group.
+int test_integer_dot_product(cl_device_id deviceID, cl_context context,
+                             cl_command_queue queue, int num_elements)
+{
+    if (!is_extension_available(deviceID, "cl_khr_integer_dot_product"))
+    {
+        log_info("cl_khr_integer_dot_product is not supported\n");
+        return TEST_SKIPPED_ITSELF;
+    }
+
+    // Capability bits this test knows how to exercise.
+    const cl_device_integer_dot_product_capabilities_khr kPacked =
+        CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR;
+    const cl_device_integer_dot_product_capabilities_khr kVector =
+        CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR;
+
+    cl_device_integer_dot_product_capabilities_khr dotCaps = 0;
+    const cl_int queryStatus = clGetDeviceInfo(
+        deviceID, CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR,
+        sizeof(dotCaps), &dotCaps, NULL);
+    test_error(
+        queryStatus,
+        "Unable to query CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR");
+    test_assert_error(
+        dotCaps & CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR,
+        "When cl_khr_integer_dot_product is supported "
+        "CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR must be "
+        "supported");
+
+    // Bits outside the two known capabilities are only reported, not failed.
+    if ((dotCaps & ~(kPacked | kVector)) != 0)
+        log_info("NOTE: found an unknown / untested capability!\n");
+
+    // Run the group for each capability bit the device reports.
+    int outcome = TEST_PASS;
+    if ((dotCaps & kVector) != 0)
+        outcome |= test_vectype<cl_uchar, cl_uint, 4>(deviceID, context, queue,
+                                                      num_elements);
+
+    if ((dotCaps & kPacked) != 0)
+        outcome |= test_vectype_packed<cl_uchar, cl_uint, 4>(
+            deviceID, context, queue, num_elements);
+
+    return outcome;
+}

From 39fdb462be7ea4bf2c2b2c6d23e84a70c3def78d Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Sat, 28 Aug 2021 02:21:34 -0700
Subject: [PATCH 121/158] define NOMINMAX in the CMakefile to fix std::min and
 std::max on MSVC (#1308)

---
 CMakeLists.txt                                            | 2 ++
 test_common/harness/kernelHelpers.cpp                     | 2 +-
 test_common/harness/os_helpers.cpp                        | 3 ---
 test_conformance/basic/test_async_copy2D.cpp              | 4 ++--
 test_conformance/basic/test_async_copy3D.cpp              | 4 ++--
 test_conformance/integer_ops/test_integer_dot_product.cpp | 5 -----
 6 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a614649fd2..04551dfb34 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -115,6 +115,8 @@ endif()
 if(MSVC)
     # Don't warn when using standard non-secure functions.
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+    # Fix std::min and std::max handling with windows.harness.
+    add_compile_definitions(NOMINMAX)
 endif()
 
 if( WIN32 AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Intel" )
diff --git a/test_common/harness/kernelHelpers.cpp b/test_common/harness/kernelHelpers.cpp
index 18f51cbe26..1d1f8d8c4c 100644
--- a/test_common/harness/kernelHelpers.cpp
+++ b/test_common/harness/kernelHelpers.cpp
@@ -1707,7 +1707,7 @@ Version get_max_OpenCL_C_for_context(cl_context context)
                       else
                       {
                           current_version =
-                              (std::min)(device_version, current_version);
+                              std::min(device_version, current_version);
                       }
                   });
     return current_version;
diff --git a/test_common/harness/os_helpers.cpp b/test_common/harness/os_helpers.cpp
index daf2195851..8fc911083b 100644
--- a/test_common/harness/os_helpers.cpp
+++ b/test_common/harness/os_helpers.cpp
@@ -333,9 +333,6 @@ std::string exe_dir()
 
 
 #include <windows.h>
-#if defined(max)
-#undef max
-#endif
 
 #include <cctype>
 #include <algorithm>
diff --git a/test_conformance/basic/test_async_copy2D.cpp b/test_conformance/basic/test_async_copy2D.cpp
index 9fbdcb6e39..fafcac837a 100644
--- a/test_conformance/basic/test_async_copy2D.cpp
+++ b/test_conformance/basic/test_async_copy2D.cpp
@@ -203,13 +203,13 @@ int test_copy2D(cl_device_id deviceID, cl_context context,
         / (numElementsPerLine + srcStride);
     size_t maxTotalLinesOut = (max_alloc_size / elementSize + dstStride)
         / (numElementsPerLine + dstStride);
-    size_t maxTotalLines = (std::min)(maxTotalLinesIn, maxTotalLinesOut);
+    size_t maxTotalLines = std::min(maxTotalLinesIn, maxTotalLinesOut);
     size_t maxLocalWorkgroups =
         maxTotalLines / (localWorkgroupSize * lineCopiesPerWorkItem);
 
     size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem
         - (localIsDst ? dstStride : srcStride);
-    size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups);
+    size_t numberOfLocalWorkgroups = std::min(1111, (int)maxLocalWorkgroups);
     size_t totalLines =
         numberOfLocalWorkgroups * localWorkgroupSize * lineCopiesPerWorkItem;
     size_t inBufferSize = elementSize
diff --git a/test_conformance/basic/test_async_copy3D.cpp b/test_conformance/basic/test_async_copy3D.cpp
index 252159bc2b..2b184ee596 100644
--- a/test_conformance/basic/test_async_copy3D.cpp
+++ b/test_conformance/basic/test_async_copy3D.cpp
@@ -230,13 +230,13 @@ int test_copy3D(cl_device_id deviceID, cl_context context,
     size_t maxTotalPlanesOut = ((max_alloc_size / elementSize) + dstPlaneStride)
         / ((numLines * numElementsPerLine + numLines * dstLineStride)
            + dstPlaneStride);
-    size_t maxTotalPlanes = (std::min)(maxTotalPlanesIn, maxTotalPlanesOut);
+    size_t maxTotalPlanes = std::min(maxTotalPlanesIn, maxTotalPlanesOut);
     size_t maxLocalWorkgroups =
         maxTotalPlanes / (localWorkgroupSize * planesCopiesPerWorkItem);
 
     size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem
         - (localIsDst ? dstPlaneStride : srcPlaneStride);
-    size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups);
+    size_t numberOfLocalWorkgroups = std::min(1111, (int)maxLocalWorkgroups);
     size_t totalPlanes =
         numberOfLocalWorkgroups * localWorkgroupSize * planesCopiesPerWorkItem;
     size_t inBufferSize = elementSize
diff --git a/test_conformance/integer_ops/test_integer_dot_product.cpp b/test_conformance/integer_ops/test_integer_dot_product.cpp
index b5378ae093..be25b320ab 100644
--- a/test_conformance/integer_ops/test_integer_dot_product.cpp
+++ b/test_conformance/integer_ops/test_integer_dot_product.cpp
@@ -14,11 +14,6 @@
 // limitations under the License.
 //
 
-// This is needed for std::numeric_limits<>::min() and max() to work on Windows.
-#if defined(_WIN32)
-#define NOMINMAX
-#endif
-
 #include <algorithm>
 #include <limits>
 #include <numeric>

From 7cfd3a6033f547905da40c06fae32b9337df0b03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=A9vin=20Petit?= <kpet@free.fr>
Date: Sun, 29 Aug 2021 23:12:52 +0100
Subject: [PATCH 122/158] Report failures in  simple_{read,write}_image_pitch
 tests (#1309)

---
 test_conformance/basic/test_simple_image_pitch.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_conformance/basic/test_simple_image_pitch.cpp b/test_conformance/basic/test_simple_image_pitch.cpp
index 1cd82b6f8d..2eb43b3a51 100644
--- a/test_conformance/basic/test_simple_image_pitch.cpp
+++ b/test_conformance/basic/test_simple_image_pitch.cpp
@@ -83,7 +83,7 @@ int test_simple_read_image_pitch(cl_device_id device, cl_context cl_context_, cl
   free(host_image);
   free(host_buffer);
 
-  return CL_SUCCESS;
+  return errors == 0 ? TEST_PASS : TEST_FAIL;
 }
 
 int test_simple_write_image_pitch(cl_device_id device, cl_context cl_context_, cl_command_queue q, int num_elements)
@@ -149,5 +149,5 @@ int test_simple_write_image_pitch(cl_device_id device, cl_context cl_context_, c
   free(host_image);
   free(host_buffer);
 
-  return CL_SUCCESS;
+  return errors == 0 ? TEST_PASS : TEST_FAIL;
 }

From e27a97fbd81b6b426a29857a3e1c04d37255931c Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Tue, 31 Aug 2021 16:53:55 +0200
Subject: [PATCH 123/158] Add cl_khr_integer_dot_product to known extensions in
 test compiler. (#1316)

---
 .../compiler/test_compiler_defines_for_extensions.cpp          | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
index a1d8d8bdb5..de30e06b20 100644
--- a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
+++ b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
@@ -50,6 +50,7 @@ const char *known_extensions[] = {
     "cl_khr_subgroup_shuffle_relative",
     "cl_khr_subgroup_clustered_reduce",
     "cl_khr_extended_bit_ops",
+    "cl_khr_integer_dot_product",
     // API-only extensions after this point.  If you add above here, modify
     // first_API_extension below.
     "cl_khr_icd",
@@ -77,7 +78,7 @@ const char *known_extensions[] = {
 };
 
 size_t num_known_extensions = sizeof(known_extensions) / sizeof(char *);
-size_t first_API_extension = 28;
+size_t first_API_extension = 29;
 
 const char *known_embedded_extensions[] = {
     "cles_khr_int64",

From 995c7dbfbbb7b38c4ad6ce59d66b01b53ef031b2 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Tue, 31 Aug 2021 11:44:17 -0700
Subject: [PATCH 124/158] suppress MSVC strdup warning (#1314)

---
 CMakeLists.txt             | 2 ++
 test_common/CMakeLists.txt | 5 -----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04551dfb34..7b307a119d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -115,6 +115,8 @@ endif()
 if(MSVC)
     # Don't warn when using standard non-secure functions.
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+    # Don't warn about using the portable "strdup" function.
+    add_compile_definitions(_CRT_NONSTDC_NO_DEPRECATE)
     # Fix std::min and std::max handling with windows.harness.
     add_compile_definitions(NOMINMAX)
 endif()
diff --git a/test_common/CMakeLists.txt b/test_common/CMakeLists.txt
index 61580300b6..b05053459f 100644
--- a/test_common/CMakeLists.txt
+++ b/test_common/CMakeLists.txt
@@ -21,8 +21,3 @@ set(HARNESS_SOURCES
 )
 
 add_library(harness STATIC ${HARNESS_SOURCES})
-
-if(MSVC)
-    # Don't warn about using the portable "strdup" function.
-    target_compile_definitions(harness PRIVATE _CRT_NONSTDC_NO_DEPRECATE)
-endif()
\ No newline at end of file

From 0601c6f7658c80af50d6f6a2ac947682d75bcd50 Mon Sep 17 00:00:00 2001
From: James Price <jrprice@google.com>
Date: Tue, 31 Aug 2021 14:45:24 -0400
Subject: [PATCH 125/158] Add missing include for gRandomSeed (#1307)

---
 test_common/harness/integer_ops_test_info.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test_common/harness/integer_ops_test_info.h b/test_common/harness/integer_ops_test_info.h
index c25843ddc0..ad7b303b47 100644
--- a/test_common/harness/integer_ops_test_info.h
+++ b/test_common/harness/integer_ops_test_info.h
@@ -18,6 +18,7 @@
 #define INTEGER_OPS_TEST_INFO_H
 
 #include "conversions.h"
+#include "testHarness.h"
 
 // TODO: expand usage to other tests.
 

From 34e47322db205d3c8c972ddebbf51bb4122e45f5 Mon Sep 17 00:00:00 2001
From: "Senran (Stephen) Zhang" <senran.zhang@intel.com>
Date: Tue, 7 Sep 2021 00:14:36 +0800
Subject: [PATCH 126/158] Limit workgroup size for atomics tests (#1197)

* Limit workgroup size for atomics tests

This avoids extremely large local buffer size and slow run

* Always limit workgroup size
---
 test_conformance/atomics/test_atomics.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test_conformance/atomics/test_atomics.cpp b/test_conformance/atomics/test_atomics.cpp
index 34b34ed38a..c0c0136319 100644
--- a/test_conformance/atomics/test_atomics.cpp
+++ b/test_conformance/atomics/test_atomics.cpp
@@ -200,6 +200,10 @@ int test_atomic_function(cl_device_id deviceID, cl_context context, cl_command_q
         error = clGetKernelWorkGroupInfo( kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof( workSize ), &workSize, NULL );
         test_error( error, "Unable to obtain max work group size for device and kernel combo" );
 
+        // Limit workSize to avoid extremely large local buffer size and slow
+        // run.
+        if (workSize > 65536) workSize = 65536;
+
         // "workSize" is limited to that of the first dimension as only a 1DRange is executed.
         if( maxSizes[0] < workSize )
         {

From 1f26e1d8ba372f4c638f9c0cdae7566e349b9b9a Mon Sep 17 00:00:00 2001
From: Jeremy Kemp <jeremy@jeremykemp.co.uk>
Date: Tue, 7 Sep 2021 12:47:44 +0100
Subject: [PATCH 127/158] Fix memory model issue in `atomic_flag`. (#1283)

* Fix memory model issue in atomic_flag.

In atomic_flag sub-tests that modify local memory, compilers may re-order memory accesses between the local and global address spaces, which can lead to spurious test failures.

This commit ensures that both local and global memory operations are fenced to prevent this re-ordering from occurring.

Fixes #134.

* Clang format changes.

* Added the missing global acquire, which is necessary for the corresponding global release.

Thanks to @jlewis-austin for spotting.

* Clang format changes.

* Match the condition for applying acquire/release fences.
---
 test_conformance/c11_atomics/test_atomics.cpp | 36 ++++++++++++-------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/test_conformance/c11_atomics/test_atomics.cpp b/test_conformance/c11_atomics/test_atomics.cpp
index c3a190b73b..38b4e9a788 100644
--- a/test_conformance/c11_atomics/test_atomics.cpp
+++ b/test_conformance/c11_atomics/test_atomics.cpp
@@ -1657,12 +1657,18 @@ class CBasicTestFlag : public CBasicTestMemOrderScope<HostAtomicType, HostDataTy
       "  for(cnt = 0; !stop && cnt < threadCount; cnt++) // each thread must find critical section where it is the first visitor\n"
       "  {\n"
       "    bool set = atomic_flag_test_and_set" + postfix + "(&destMemory[cnt]" + memoryOrderScope + ");\n";
-    if (MemoryOrder() == MEMORY_ORDER_RELAXED || MemoryOrder() == MEMORY_ORDER_RELEASE)
-      program += "    atomic_work_item_fence(" +
-                 std::string(LocalMemory() ? "CLK_LOCAL_MEM_FENCE, " : "CLK_GLOBAL_MEM_FENCE, ") +
-                 "memory_order_acquire," +
-                 std::string(LocalMemory() ? "memory_scope_work_group" : (UseSVM() ? "memory_scope_all_svm_devices" : "memory_scope_device") ) +
-                 ");\n";
+    if (MemoryOrder() == MEMORY_ORDER_RELAXED
+        || MemoryOrder() == MEMORY_ORDER_RELEASE || LocalMemory())
+        program += "    atomic_work_item_fence("
+            + std::string(LocalMemory()
+                              ? "CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, "
+                              : "CLK_GLOBAL_MEM_FENCE, ")
+            + "memory_order_acquire,"
+            + std::string(LocalMemory()
+                              ? "memory_scope_work_group"
+                              : (UseSVM() ? "memory_scope_all_svm_devices"
+                                          : "memory_scope_device"))
+            + ");\n";
 
     program +=
       "    if (!set)\n"
@@ -1683,12 +1689,18 @@ class CBasicTestFlag : public CBasicTestMemOrderScope<HostAtomicType, HostDataTy
       "        stop = 1;\n"
       "      }\n";
 
-    if (MemoryOrder() == MEMORY_ORDER_ACQUIRE || MemoryOrder() == MEMORY_ORDER_RELAXED)
-      program += "      atomic_work_item_fence(" +
-                 std::string(LocalMemory() ? "CLK_LOCAL_MEM_FENCE, " : "CLK_GLOBAL_MEM_FENCE, ") +
-                 "memory_order_release," +
-                 std::string(LocalMemory() ? "memory_scope_work_group" : (UseSVM() ? "memory_scope_all_svm_devices" : "memory_scope_device") ) +
-                 ");\n";
+    if (MemoryOrder() == MEMORY_ORDER_ACQUIRE
+        || MemoryOrder() == MEMORY_ORDER_RELAXED || LocalMemory())
+        program += "      atomic_work_item_fence("
+            + std::string(LocalMemory()
+                              ? "CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, "
+                              : "CLK_GLOBAL_MEM_FENCE, ")
+            + "memory_order_release,"
+            + std::string(LocalMemory()
+                              ? "memory_scope_work_group"
+                              : (UseSVM() ? "memory_scope_all_svm_devices"
+                                          : "memory_scope_device"))
+            + ");\n";
 
     program +=
       "      atomic_flag_clear" + postfix + "(&destMemory[cnt]" + MemoryOrderScopeStrForClear() + ");\n"

From 02bf24d2b1684b1ffde079d3598a8fc70610d4fc Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Mon, 13 Sep 2021 05:25:32 -0700
Subject: [PATCH 128/158] remove min max macros (#1310)

* remove the MIN and MAX macros and use the std versions instead

* fix formatting

* fix Arm build

* remove additional MIN and MAX macros from compat.h
---
 test_common/harness/compat.h                  |  7 ---
 test_common/harness/errorHelpers.cpp          | 20 +++----
 test_common/harness/imageHelpers.cpp          |  3 -
 .../basic/test_enqueued_local_size.cpp        |  8 ++-
 test_conformance/buffers/test_sub_buffers.cpp | 22 +++----
 .../conversions/test_conversions.cpp          |  7 ++-
 .../device_execution/enqueue_ndrange.cpp      |  3 +-
 .../device_execution/host_queue_order.cpp     |  3 +-
 test_conformance/half/Test_roundTrip.cpp      |  7 ++-
 test_conformance/half/Test_vLoadHalf.cpp      |  7 ++-
 test_conformance/half/Test_vStoreHalf.cpp     | 11 ++--
 .../images/kernel_read_write/test_common.cpp  | 25 ++++----
 .../kernel_read_write/test_iterations.cpp     | 32 ++++++----
 .../images/kernel_read_write/test_read_1D.cpp | 26 +++++---
 .../kernel_read_write/test_read_1D_array.cpp  | 27 ++++++---
 .../kernel_read_write/test_read_2D_array.cpp  | 32 ++++++----
 test_conformance/integer_ops/test_add_sat.cpp | 31 ++++------
 .../integer_ops/test_integers.cpp             | 60 +++++++++----------
 test_conformance/integer_ops/test_sub_sat.cpp | 32 ++++------
 .../integer_ops/test_unary_ops.cpp            |  2 +-
 .../math_brute_force/macro_binary_double.cpp  |  3 +-
 .../math_brute_force/macro_binary_float.cpp   |  3 +-
 .../math_brute_force/macro_unary_double.cpp   |  3 +-
 .../math_brute_force/macro_unary_float.cpp    |  4 +-
 test_conformance/math_brute_force/main.cpp    |  5 +-
 test_conformance/profiling/execute.cpp        | 12 ++--
 .../workgroups/test_wg_broadcast.cpp          |  6 +-
 .../workgroups/test_wg_scan_exclusive_max.cpp | 11 ++--
 .../workgroups/test_wg_scan_exclusive_min.cpp | 11 ++--
 .../workgroups/test_wg_scan_inclusive_max.cpp | 10 ++--
 .../workgroups/test_wg_scan_inclusive_min.cpp | 10 ++--
 31 files changed, 241 insertions(+), 202 deletions(-)

diff --git a/test_common/harness/compat.h b/test_common/harness/compat.h
index 3b55785269..4053b7ee72 100644
--- a/test_common/harness/compat.h
+++ b/test_common/harness/compat.h
@@ -309,13 +309,6 @@ EXTERN_C int __builtin_clz(unsigned int pattern);
 
 #endif
 
-#ifndef MIN
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-#endif
-#ifndef MAX
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-#endif
-
 
 /*-----------------------------------------------------------------------------
    WARNING: DO NOT USE THESE MACROS:
diff --git a/test_common/harness/errorHelpers.cpp b/test_common/harness/errorHelpers.cpp
index ea928bc395..eaccf64119 100644
--- a/test_common/harness/errorHelpers.cpp
+++ b/test_common/harness/errorHelpers.cpp
@@ -18,6 +18,8 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include <algorithm>
+
 #include "errorHelpers.h"
 
 #include "parseParameters.h"
@@ -301,10 +303,6 @@ const char *GetQueuePropertyName(cl_command_queue_properties property)
     }
 }
 
-#ifndef MAX
-#define MAX(_a, _b) ((_a) > (_b) ? (_a) : (_b))
-#endif
-
 #if defined(_MSC_VER)
 #define scalbnf(_a, _i) ldexpf(_a, _i)
 #define scalbn(_a, _i) ldexp(_a, _i)
@@ -357,7 +355,7 @@ static float Ulp_Error_Half_Float(float test, double reference)
 
         // The unbiased exponent of the ulp unit place
         int ulp_exp =
-            HALF_MANT_DIG - 1 - MAX(ilogb(reference), HALF_MIN_EXP - 1);
+            HALF_MANT_DIG - 1 - std::max(ilogb(reference), HALF_MIN_EXP - 1);
 
         // Scale the exponent of the error
         return (float)scalbn(testVal - reference, ulp_exp);
@@ -365,7 +363,7 @@ static float Ulp_Error_Half_Float(float test, double reference)
 
     // reference is a normal power of two or a zero
     int ulp_exp =
-        HALF_MANT_DIG - 1 - MAX(ilogb(reference) - 1, HALF_MIN_EXP - 1);
+        HALF_MANT_DIG - 1 - std::max(ilogb(reference) - 1, HALF_MIN_EXP - 1);
 
     // Scale the exponent of the error
     return (float)scalbn(testVal - reference, ulp_exp);
@@ -437,7 +435,8 @@ float Ulp_Error(float test, double reference)
             return 0.0f; // if we are expecting a NaN, any NaN is fine
 
         // The unbiased exponent of the ulp unit place
-        int ulp_exp = FLT_MANT_DIG - 1 - MAX(ilogb(reference), FLT_MIN_EXP - 1);
+        int ulp_exp =
+            FLT_MANT_DIG - 1 - std::max(ilogb(reference), FLT_MIN_EXP - 1);
 
         // Scale the exponent of the error
         return (float)scalbn(testVal - reference, ulp_exp);
@@ -445,7 +444,8 @@ float Ulp_Error(float test, double reference)
 
     // reference is a normal power of two or a zero
     // The unbiased exponent of the ulp unit place
-    int ulp_exp = FLT_MANT_DIG - 1 - MAX(ilogb(reference) - 1, FLT_MIN_EXP - 1);
+    int ulp_exp =
+        FLT_MANT_DIG - 1 - std::max(ilogb(reference) - 1, FLT_MIN_EXP - 1);
 
     // Scale the exponent of the error
     return (float)scalbn(testVal - reference, ulp_exp);
@@ -513,7 +513,7 @@ float Ulp_Error_Double(double test, long double reference)
 
         // The unbiased exponent of the ulp unit place
         int ulp_exp =
-            DBL_MANT_DIG - 1 - MAX(ilogbl(reference), DBL_MIN_EXP - 1);
+            DBL_MANT_DIG - 1 - std::max(ilogbl(reference), DBL_MIN_EXP - 1);
 
         // Scale the exponent of the error
         float result = (float)scalbnl(testVal - reference, ulp_exp);
@@ -529,7 +529,7 @@ float Ulp_Error_Double(double test, long double reference)
     // reference is a normal power of two or a zero
     // The unbiased exponent of the ulp unit place
     int ulp_exp =
-        DBL_MANT_DIG - 1 - MAX(ilogbl(reference) - 1, DBL_MIN_EXP - 1);
+        DBL_MANT_DIG - 1 - std::max(ilogbl(reference) - 1, DBL_MIN_EXP - 1);
 
     // Scale the exponent of the error
     float result = (float)scalbnl(testVal - reference, ulp_exp);
diff --git a/test_common/harness/imageHelpers.cpp b/test_common/harness/imageHelpers.cpp
index 314709f82c..3a5c5533aa 100644
--- a/test_common/harness/imageHelpers.cpp
+++ b/test_common/harness/imageHelpers.cpp
@@ -690,9 +690,6 @@ int has_alpha(const cl_image_format *format)
         _b ^= _a;                                                              \
         _a ^= _b;                                                              \
     } while (0)
-#ifndef MAX
-#define MAX(_a, _b) ((_a) > (_b) ? (_a) : (_b))
-#endif
 
 void get_max_sizes(
     size_t *numberOfSizes, const int maxNumberOfSizes, size_t sizes[][3],
diff --git a/test_conformance/basic/test_enqueued_local_size.cpp b/test_conformance/basic/test_enqueued_local_size.cpp
index f52162a815..91fe1434e9 100644
--- a/test_conformance/basic/test_enqueued_local_size.cpp
+++ b/test_conformance/basic/test_enqueued_local_size.cpp
@@ -14,13 +14,15 @@
 // limitations under the License.
 //
 #include "harness/compat.h"
+#include "harness/rounding_mode.h"
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <sys/stat.h>
-#include "harness/rounding_mode.h"
+
+#include <algorithm>
 
 #include "procs.h"
 
@@ -124,8 +126,8 @@ test_enqueued_local_size(cl_device_id device, cl_context context, cl_command_que
     err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_wgs), &max_wgs, NULL);
     test_error( err, "clGetDeviceInfo failed.");
 
-    localsize[0] = MIN(16, max_wgs);
-    localsize[1] = MIN(11, max_wgs / localsize[0]);
+    localsize[0] = std::min<size_t>(16, max_wgs);
+    localsize[1] = std::min<size_t>(11, max_wgs / localsize[0]);
     // If we need to use uniform workgroups because non-uniform workgroups are
     // not supported, round up to the next global size that is divisible by the
     // local size.
diff --git a/test_conformance/buffers/test_sub_buffers.cpp b/test_conformance/buffers/test_sub_buffers.cpp
index 691509fdcc..d6ab111e1d 100644
--- a/test_conformance/buffers/test_sub_buffers.cpp
+++ b/test_conformance/buffers/test_sub_buffers.cpp
@@ -15,6 +15,8 @@
 //
 #include "procs.h"
 
+#include <algorithm>
+
 // Design:
 // To test sub buffers, we first create one main buffer. We then create several sub-buffers and
 // queue Actions on each one. Each Action is encapsulated in a class so it can keep track of
@@ -101,13 +103,6 @@ class ReadWriteAction : public Action
     }
 };
 
-#ifndef MAX
-#define MAX( _a, _b )   ( (_a) > (_b) ? (_a) : (_b) )
-#endif
-#ifndef MIN
-#define MIN( _a, _b )   ( (_a) < (_b) ? (_a) : (_b) )
-#endif
-
 class CopyAction : public Action
 {
 public:
@@ -117,7 +112,8 @@ class CopyAction : public Action
     virtual cl_int Execute( cl_context context, cl_command_queue queue, cl_char tag, SubBufferWrapper &buffer1, SubBufferWrapper &buffer2, cl_char *parentBufferState )
     {
         // Copy from sub-buffer 1 to sub-buffer 2
-        size_t size = get_random_size_t( 0, MIN( buffer1.mSize, buffer2.mSize ), GetRandSeed() );
+        size_t size = get_random_size_t(
+            0, std::min(buffer1.mSize, buffer2.mSize), GetRandSeed());
 
         size_t startOffset = get_random_size_t( 0, buffer1.mSize - size, GetRandSeed() );
         size_t endOffset = get_random_size_t( 0, buffer2.mSize - size, GetRandSeed() );
@@ -266,7 +262,11 @@ int test_sub_buffers_read_write_core( cl_context context, cl_command_queue queue
             endRange = mainSize;
 
         size_t offset = get_random_size_t( toStartFrom / addressAlign, endRange / addressAlign, Action::GetRandSeed() ) * addressAlign;
-        size_t size = get_random_size_t( 1, ( MIN( mainSize / 8, mainSize - offset ) ) / addressAlign, Action::GetRandSeed() ) * addressAlign;
+        size_t size =
+            get_random_size_t(
+                1, (std::min(mainSize / 8, mainSize - offset)) / addressAlign,
+                Action::GetRandSeed())
+            * addressAlign;
         error = subBuffers[ numSubBuffers ].Allocate( mainBuffer, CL_MEM_READ_WRITE, offset, size );
         test_error( error, "Unable to allocate sub buffer" );
 
@@ -443,7 +443,7 @@ int test_sub_buffers_read_write_dual_devices( cl_device_id deviceID, cl_context
 
     error = get_reasonable_buffer_size( otherDevice, maxBuffer2 );
     test_error( error, "Unable to get buffer size for secondary device" );
-    maxBuffer1 = MIN( maxBuffer1, maxBuffer2 );
+    maxBuffer1 = std::min(maxBuffer1, maxBuffer2);
 
     cl_uint addressAlign1Bits, addressAlign2Bits;
     error = clGetDeviceInfo( deviceID, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof( addressAlign1Bits ), &addressAlign1Bits, NULL );
@@ -452,7 +452,7 @@ int test_sub_buffers_read_write_dual_devices( cl_device_id deviceID, cl_context
     error = clGetDeviceInfo( otherDevice, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof( addressAlign2Bits ), &addressAlign2Bits, NULL );
     test_error( error, "Unable to get secondary device's address alignment" );
 
-    cl_uint addressAlign1 = MAX( addressAlign1Bits, addressAlign2Bits ) / 8;
+    cl_uint addressAlign1 = std::max(addressAlign1Bits, addressAlign2Bits) / 8;
 
     // Finally time to run!
     return test_sub_buffers_read_write_core( testingContext, queue1, queue2, maxBuffer1, addressAlign1 );
diff --git a/test_conformance/conversions/test_conversions.cpp b/test_conformance/conversions/test_conversions.cpp
index 87b8ead741..e8e572e667 100644
--- a/test_conformance/conversions/test_conversions.cpp
+++ b/test_conformance/conversions/test_conversions.cpp
@@ -47,6 +47,8 @@
 #endif
 #include <time.h>
 
+#include <algorithm>
+
 #include "Sleep.h"
 #include "basic_test_conversions.h"
 
@@ -1003,7 +1005,8 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod
     uint64_t i;
 
     gTestCount++;
-    size_t blockCount = BUFFER_SIZE / MAX( gTypeSizes[ inType ], gTypeSizes[ outType ] );
+    size_t blockCount =
+        BUFFER_SIZE / std::max(gTypeSizes[inType], gTypeSizes[outType]);
     size_t step = blockCount;
     uint64_t lastCase = 1ULL << (8*gTypeSizes[ inType ]);
     cl_event writeInputBuffer = NULL;
@@ -1078,7 +1081,7 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod
             fflush(stdout);
         }
 
-        cl_uint count = (uint32_t) MIN( blockCount, lastCase - i );
+        cl_uint count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
         writeInputBufferInfo.count = count;
 
         // Crate a user event to represent the status of the reference value computation completion
diff --git a/test_conformance/device_execution/enqueue_ndrange.cpp b/test_conformance/device_execution/enqueue_ndrange.cpp
index 8ced6629d8..f228f06373 100644
--- a/test_conformance/device_execution/enqueue_ndrange.cpp
+++ b/test_conformance/device_execution/enqueue_ndrange.cpp
@@ -18,6 +18,7 @@
 #include "harness/testHarness.h"
 #include "harness/typeWrappers.h"
 
+#include <algorithm>
 #include <vector>
 
 #include "procs.h"
@@ -645,7 +646,7 @@ int test_enqueue_ndrange(cl_device_id device, cl_context context, cl_command_que
     max_local_size = (max_local_size > MAX_GWS)? MAX_GWS: max_local_size;
     if(gWimpyMode)
     {
-        max_local_size = MIN(8, max_local_size);
+        max_local_size = std::min((size_t)8, max_local_size);
     }
 
     cl_uint num = 10;
diff --git a/test_conformance/device_execution/host_queue_order.cpp b/test_conformance/device_execution/host_queue_order.cpp
index 2b5688d126..5376ea40f0 100644
--- a/test_conformance/device_execution/host_queue_order.cpp
+++ b/test_conformance/device_execution/host_queue_order.cpp
@@ -18,6 +18,7 @@
 #include "harness/testHarness.h"
 #include "harness/typeWrappers.h"
 
+#include <algorithm>
 #include <vector>
 
 #include "procs.h"
@@ -124,7 +125,7 @@ int test_host_queue_order(cl_device_id device, cl_context context, cl_command_qu
     cl_uint num = arr_size(result);
     if( gWimpyMode )
     {
-        num = MAX(num / 16, 4);
+        num = std::max(num / 16, 4U);
     }
 
     clMemWrapper res_mem;
diff --git a/test_conformance/half/Test_roundTrip.cpp b/test_conformance/half/Test_roundTrip.cpp
index 69fc7e4184..1ab4093763 100644
--- a/test_conformance/half/Test_roundTrip.cpp
+++ b/test_conformance/half/Test_roundTrip.cpp
@@ -14,6 +14,9 @@
 // limitations under the License.
 //
 #include <string.h>
+
+#include <algorithm>
+
 #include "cl_utils.h"
 #include "tests.h"
 #include "harness/testHarness.h"
@@ -156,7 +159,7 @@ int test_roundTrip( cl_device_id device, cl_context context, cl_command_queue qu
     }
 
     // Figure out how many elements are in a work block
-    size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float));
+    size_t elementSize = std::max(sizeof(cl_half), sizeof(cl_float));
     size_t blockCount = (size_t)getBufferSize(device) / elementSize; //elementSize is a power of two
     uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of cl_half
     size_t stride = blockCount;
@@ -168,7 +171,7 @@ int test_roundTrip( cl_device_id device, cl_context context, cl_command_queue qu
 
     for( i = 0; i < (uint64_t)lastCase; i += stride )
     {
-        count = (uint32_t) MIN( blockCount, lastCase - i );
+        count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
 
         //Init the input stream
         uint16_t *p = (uint16_t *)gIn_half;
diff --git a/test_conformance/half/Test_vLoadHalf.cpp b/test_conformance/half/Test_vLoadHalf.cpp
index 5dfac7a30e..e93540191d 100644
--- a/test_conformance/half/Test_vLoadHalf.cpp
+++ b/test_conformance/half/Test_vLoadHalf.cpp
@@ -17,6 +17,9 @@
 #include "harness/testHarness.h"
 
 #include <string.h>
+
+#include <algorithm>
+
 #include "cl_utils.h"
 #include "tests.h"
 
@@ -429,7 +432,7 @@ int Test_vLoadHalf_private( cl_device_id device, bool aligned )
     }
 
     // Figure out how many elements are in a work block
-    size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float));
+    size_t elementSize = std::max(sizeof(cl_half), sizeof(cl_float));
     size_t blockCount = getBufferSize(device) / elementSize; // elementSize is power of 2
     uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of things of size cl_half
 
@@ -447,7 +450,7 @@ int Test_vLoadHalf_private( cl_device_id device, bool aligned )
 
     for( i = 0; i < (uint64_t)lastCase; i += blockCount )
     {
-        count = (uint32_t) MIN( blockCount, lastCase - i );
+        count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
 
         //Init the input stream
         uint16_t *p = (uint16_t *)gIn_half;
diff --git a/test_conformance/half/Test_vStoreHalf.cpp b/test_conformance/half/Test_vStoreHalf.cpp
index c3a328ad64..85824a9fb9 100644
--- a/test_conformance/half/Test_vStoreHalf.cpp
+++ b/test_conformance/half/Test_vStoreHalf.cpp
@@ -18,6 +18,9 @@
 #include "harness/testHarness.h"
 
 #include <string.h>
+
+#include <algorithm>
+
 #include "cl_utils.h"
 #include "tests.h"
 
@@ -674,7 +677,7 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR
     } // end for vector size
 
     // Figure out how many elements are in a work block
-    size_t elementSize = MAX( sizeof(cl_ushort), sizeof(float));
+    size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
     size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2
     uint64_t lastCase = 1ULL << (8*sizeof(float)); // number of floats.
     size_t stride = blockCount;
@@ -726,7 +729,7 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR
 
     for( i = 0; i < lastCase; i += stride )
     {
-        count = (cl_uint) MIN( blockCount, lastCase - i );
+        count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
         fref.i = i;
         dref.i = i;
 
@@ -1272,7 +1275,7 @@ int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h double
     }
 
     // Figure out how many elements are in a work block
-    size_t elementSize = MAX( sizeof(cl_ushort), sizeof(float));
+    size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
     size_t blockCount = BUFFER_SIZE / elementSize;
     uint64_t lastCase = 1ULL << (8*sizeof(float));
     size_t stride = blockCount;
@@ -1323,7 +1326,7 @@ int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h double
 
     for( i = 0; i < (uint64_t)lastCase; i += stride )
     {
-        count = (cl_uint) MIN( blockCount, lastCase - i );
+        count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
         fref.i = i;
         dref.i = i;
 
diff --git a/test_conformance/images/kernel_read_write/test_common.cpp b/test_conformance/images/kernel_read_write/test_common.cpp
index 375ee5877e..6b3cf849da 100644
--- a/test_conformance/images/kernel_read_write/test_common.cpp
+++ b/test_conformance/images/kernel_read_write/test_common.cpp
@@ -16,6 +16,7 @@
 
 #include "test_common.h"
 
+#include <algorithm>
 
 cl_sampler create_sampler(cl_context context, image_sampler_data *sdata, bool test_mipmaps, cl_int *error) {
     cl_sampler sampler = nullptr;
@@ -934,13 +935,13 @@ int test_read_image(cl_context context, cl_command_queue queue,
                                         {
                                             err4 = 0.0f;
                                         }
-                                        float maxErr1 = MAX(
+                                        float maxErr1 = std::max(
                                             maxErr * maxPixel.p[0], FLT_MIN);
-                                        float maxErr2 = MAX(
+                                        float maxErr2 = std::max(
                                             maxErr * maxPixel.p[1], FLT_MIN);
-                                        float maxErr3 = MAX(
+                                        float maxErr3 = std::max(
                                             maxErr * maxPixel.p[2], FLT_MIN);
-                                        float maxErr4 = MAX(
+                                        float maxErr4 = std::max(
                                             maxErr * maxPixel.p[3], FLT_MIN);
 
                                         if (!(err1 <= maxErr1)
@@ -1039,17 +1040,17 @@ int test_read_image(cl_context context, cl_command_queue queue,
                                             float err4 = ABS_ERROR(resultPtr[3],
                                                                    expected[3]);
                                             float maxErr1 =
-                                                MAX(maxErr * maxPixel.p[0],
-                                                    FLT_MIN);
+                                                std::max(maxErr * maxPixel.p[0],
+                                                         FLT_MIN);
                                             float maxErr2 =
-                                                MAX(maxErr * maxPixel.p[1],
-                                                    FLT_MIN);
+                                                std::max(maxErr * maxPixel.p[1],
+                                                         FLT_MIN);
                                             float maxErr3 =
-                                                MAX(maxErr * maxPixel.p[2],
-                                                    FLT_MIN);
+                                                std::max(maxErr * maxPixel.p[2],
+                                                         FLT_MIN);
                                             float maxErr4 =
-                                                MAX(maxErr * maxPixel.p[3],
-                                                    FLT_MIN);
+                                                std::max(maxErr * maxPixel.p[3],
+                                                         FLT_MIN);
 
 
                                             if (!(err1 <= maxErr1)
diff --git a/test_conformance/images/kernel_read_write/test_iterations.cpp b/test_conformance/images/kernel_read_write/test_iterations.cpp
index 03ca9595ff..3b779fab4f 100644
--- a/test_conformance/images/kernel_read_write/test_iterations.cpp
+++ b/test_conformance/images/kernel_read_write/test_iterations.cpp
@@ -16,6 +16,8 @@
 #include "test_common.h"
 #include <float.h>
 
+#include <algorithm>
+
 #if defined( __APPLE__ )
     #include <signal.h>
     #include <sys/signal.h>
@@ -434,7 +436,8 @@ int validate_image_2D_depth_results(void *imageValues, void *resultValues, doubl
                         float err1 = ABS_ERROR(resultPtr[0], expected[0]);
                         // Clamp to the minimum absolute error for the format
                         if (err1 > 0 && err1 < formatAbsoluteError) { err1 = 0.0f; }
-                        float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
+                        float maxErr1 =
+                            std::max(maxErr * maxPixel.p[0], FLT_MIN);
 
                         // Check if the result matches.
                         if( ! (err1 <= maxErr1) )
@@ -484,7 +487,8 @@ int validate_image_2D_depth_results(void *imageValues, void *resultValues, doubl
                                                                                     imageSampler, expected, 0, &containsDenormals );
 
                             float err1 = ABS_ERROR(resultPtr[0], expected[0]);
-                            float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
+                            float maxErr1 =
+                                std::max(maxErr * maxPixel.p[0], FLT_MIN);
 
 
                             if( ! (err1 <= maxErr1) )
@@ -598,10 +602,14 @@ int validate_image_2D_results(void *imageValues, void *resultValues, double form
                         if (err2 > 0 && err2 < formatAbsoluteError) { err2 = 0.0f; }
                         if (err3 > 0 && err3 < formatAbsoluteError) { err3 = 0.0f; }
                         if (err4 > 0 && err4 < formatAbsoluteError) { err4 = 0.0f; }
-                        float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                        float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                        float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                        float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                        float maxErr1 =
+                            std::max(maxErr * maxPixel.p[0], FLT_MIN);
+                        float maxErr2 =
+                            std::max(maxErr * maxPixel.p[1], FLT_MIN);
+                        float maxErr3 =
+                            std::max(maxErr * maxPixel.p[2], FLT_MIN);
+                        float maxErr4 =
+                            std::max(maxErr * maxPixel.p[3], FLT_MIN);
 
                         // Check if the result matches.
                         if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    ||
@@ -671,10 +679,14 @@ int validate_image_2D_results(void *imageValues, void *resultValues, double form
                             float err2 = ABS_ERROR(resultPtr[1], expected[1]);
                             float err3 = ABS_ERROR(resultPtr[2], expected[2]);
                             float err4 = ABS_ERROR(resultPtr[3], expected[3]);
-                            float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                            float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                            float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                            float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                            float maxErr1 =
+                                std::max(maxErr * maxPixel.p[0], FLT_MIN);
+                            float maxErr2 =
+                                std::max(maxErr * maxPixel.p[1], FLT_MIN);
+                            float maxErr3 =
+                                std::max(maxErr * maxPixel.p[2], FLT_MIN);
+                            float maxErr4 =
+                                std::max(maxErr * maxPixel.p[3], FLT_MIN);
 
 
                             if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    ||
diff --git a/test_conformance/images/kernel_read_write/test_read_1D.cpp b/test_conformance/images/kernel_read_write/test_read_1D.cpp
index c9ba4e847b..68113f9a84 100644
--- a/test_conformance/images/kernel_read_write/test_read_1D.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_1D.cpp
@@ -17,6 +17,8 @@
 #include "test_common.h"
 #include <float.h>
 
+#include <algorithm>
+
 #if defined( __APPLE__ )
     #include <signal.h>
     #include <sys/signal.h>
@@ -669,10 +671,14 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, cl_kernel ke
                             if (err2 > 0 && err2 < formatAbsoluteError) { err2 = 0.0f; }
                             if (err3 > 0 && err3 < formatAbsoluteError) { err3 = 0.0f; }
                             if (err4 > 0 && err4 < formatAbsoluteError) { err4 = 0.0f; }
-                            float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                            float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                            float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                            float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                            float maxErr1 =
+                                std::max(maxErr * maxPixel.p[0], FLT_MIN);
+                            float maxErr2 =
+                                std::max(maxErr * maxPixel.p[1], FLT_MIN);
+                            float maxErr3 =
+                                std::max(maxErr * maxPixel.p[2], FLT_MIN);
+                            float maxErr4 =
+                                std::max(maxErr * maxPixel.p[3], FLT_MIN);
 
                             // Check if the result matches.
                             if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    ||
@@ -732,10 +738,14 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, cl_kernel ke
                                     ABS_ERROR(resultPtr[2], expected[2]);
                                 float err4 =
                                     ABS_ERROR(resultPtr[3], expected[3]);
-                                float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                                float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                                float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                                float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                                float maxErr1 =
+                                    std::max(maxErr * maxPixel.p[0], FLT_MIN);
+                                float maxErr2 =
+                                    std::max(maxErr * maxPixel.p[1], FLT_MIN);
+                                float maxErr3 =
+                                    std::max(maxErr * maxPixel.p[2], FLT_MIN);
+                                float maxErr4 =
+                                    std::max(maxErr * maxPixel.p[3], FLT_MIN);
 
 
                                 if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    ||
diff --git a/test_conformance/images/kernel_read_write/test_read_1D_array.cpp b/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
index b3287ded9c..ac266ad73d 100644
--- a/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_1D_array.cpp
@@ -16,13 +16,14 @@
 #include "test_common.h"
 #include <float.h>
 
+#include <algorithm>
+
 #if defined( __APPLE__ )
 #include <signal.h>
 #include <sys/signal.h>
 #include <setjmp.h>
 #endif
 
-
 const char *read1DArrayKernelSourcePattern =
 "__kernel void sample_kernel( read_only image1d_array_t input,%s __global float *xOffsets, __global float *yOffsets, __global %s4 *results %s)\n"
 "{\n"
@@ -772,10 +773,14 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, cl_ker
                             if (err2 > 0 && err2 < formatAbsoluteError) { err2 = 0.0f; }
                             if (err3 > 0 && err3 < formatAbsoluteError) { err3 = 0.0f; }
                             if (err4 > 0 && err4 < formatAbsoluteError) { err4 = 0.0f; }
-                            float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                            float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                            float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                            float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                            float maxErr1 =
+                                std::max(maxErr * maxPixel.p[0], FLT_MIN);
+                            float maxErr2 =
+                                std::max(maxErr * maxPixel.p[1], FLT_MIN);
+                            float maxErr3 =
+                                std::max(maxErr * maxPixel.p[2], FLT_MIN);
+                            float maxErr4 =
+                                std::max(maxErr * maxPixel.p[3], FLT_MIN);
 
                             // Check if the result matches.
                             if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    ||
@@ -838,10 +843,14 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, cl_ker
                                     ABS_ERROR(resultPtr[2], expected[2]);
                                 float err4 =
                                     ABS_ERROR(resultPtr[3], expected[3]);
-                                float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                                float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                                float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                                float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                                float maxErr1 =
+                                    std::max(maxErr * maxPixel.p[0], FLT_MIN);
+                                float maxErr2 =
+                                    std::max(maxErr * maxPixel.p[1], FLT_MIN);
+                                float maxErr3 =
+                                    std::max(maxErr * maxPixel.p[2], FLT_MIN);
+                                float maxErr4 =
+                                    std::max(maxErr * maxPixel.p[3], FLT_MIN);
 
 
                                 if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    ||
diff --git a/test_conformance/images/kernel_read_write/test_read_2D_array.cpp b/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
index 7cb334b23f..11b7881462 100644
--- a/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
+++ b/test_conformance/images/kernel_read_write/test_read_2D_array.cpp
@@ -16,6 +16,8 @@
 #include "test_common.h"
 #include <float.h>
 
+#include <algorithm>
+
 // Utility function to clamp down image sizes for certain tests to avoid
 // using too much memory.
 static size_t reduceImageSizeRange(size_t maxDimSize) {
@@ -617,7 +619,8 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker
                                         ABS_ERROR(resultPtr[0], expected[0]);
                                     // Clamp to the minimum absolute error for the format
                                     if (err1 > 0 && err1 < formatAbsoluteError) { err1 = 0.0f; }
-                                    float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
+                                    float maxErr1 = std::max(
+                                        maxErr * maxPixel.p[0], FLT_MIN);
 
                                     if( ! (err1 <= maxErr1) )
                                     {
@@ -661,7 +664,8 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker
 
                                         float err1 = ABS_ERROR(resultPtr[0],
                                                                expected[0]);
-                                        float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
+                                        float maxErr1 = std::max(
+                                            maxErr * maxPixel.p[0], FLT_MIN);
 
 
                                         if( ! (err1 <= maxErr1) )
@@ -942,10 +946,14 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker
                                     if (err2 > 0 && err2 < formatAbsoluteError) { err2 = 0.0f; }
                                     if (err3 > 0 && err3 < formatAbsoluteError) { err3 = 0.0f; }
                                     if (err4 > 0 && err4 < formatAbsoluteError) { err4 = 0.0f; }
-                                    float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                                    float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                                    float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                                    float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                                    float maxErr1 = std::max(
+                                        maxErr * maxPixel.p[0], FLT_MIN);
+                                    float maxErr2 = std::max(
+                                        maxErr * maxPixel.p[1], FLT_MIN);
+                                    float maxErr3 = std::max(
+                                        maxErr * maxPixel.p[2], FLT_MIN);
+                                    float maxErr4 = std::max(
+                                        maxErr * maxPixel.p[3], FLT_MIN);
 
                                     if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    || ! (err3 <= maxErr3) || ! (err4 <= maxErr4) )
                                     {
@@ -1004,10 +1012,14 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker
                                                                expected[2]);
                                         float err4 = ABS_ERROR(resultPtr[3],
                                                                expected[3]);
-                                        float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN );
-                                        float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN );
-                                        float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN );
-                                        float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN );
+                                        float maxErr1 = std::max(
+                                            maxErr * maxPixel.p[0], FLT_MIN);
+                                        float maxErr2 = std::max(
+                                            maxErr * maxPixel.p[1], FLT_MIN);
+                                        float maxErr3 = std::max(
+                                            maxErr * maxPixel.p[2], FLT_MIN);
+                                        float maxErr4 = std::max(
+                                            maxErr * maxPixel.p[3], FLT_MIN);
 
 
                                         if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2)    || ! (err3 <= maxErr3) || ! (err4 <= maxErr4) )
diff --git a/test_conformance/integer_ops/test_add_sat.cpp b/test_conformance/integer_ops/test_add_sat.cpp
index c0e45d114c..e33f5c672d 100644
--- a/test_conformance/integer_ops/test_add_sat.cpp
+++ b/test_conformance/integer_ops/test_add_sat.cpp
@@ -21,18 +21,9 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
-#include "procs.h"
-
-#define UCHAR_MIN   0
-#define USHRT_MIN   0
-#define UINT_MIN    0
+#include <algorithm>
 
-#ifndef MAX
-#define MAX( _a, _b )   ( (_a) > (_b) ? (_a) : (_b) )
-#endif
-#ifndef MIN
-#define MIN( _a, _b )   ( (_a) < (_b) ? (_a) : (_b) )
-#endif
+#include "procs.h"
 
 static int verify_addsat_char( const cl_char *inA, const cl_char *inB, const cl_char *outptr, int n, const char *sizeName, int vecSize )
 {
@@ -40,8 +31,8 @@ static int verify_addsat_char( const cl_char *inA, const cl_char *inB, const cl_
     for( i = 0; i < n; i++ )
     {
         cl_int r = (cl_int) inA[i] + (cl_int) inB[i];
-        r = MAX( r, CL_CHAR_MIN );
-        r = MIN( r, CL_CHAR_MAX );
+        r = std::max(r, CL_CHAR_MIN);
+        r = std::min(r, CL_CHAR_MAX);
 
         if( r != outptr[i] )
         { log_info( "\n%d) Failure for add_sat( (char%s) 0x%2.2x, (char%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
@@ -55,9 +46,9 @@ static int verify_addsat_uchar( const cl_uchar *inA, const cl_uchar *inB, const
     for( i = 0; i < n; i++ )
     {
         cl_int r = (int) inA[i] + (int) inB[i];
-        r = MAX( r, 0 );
-        r = MIN( r, CL_UCHAR_MAX );
-        if( r != outptr[i] )
+        r = std::max(r, 0);
+        r = std::min(r, CL_UCHAR_MAX);
+        if (r != outptr[i])
         { log_info( "\n%d) Failure for add_sat( (uchar%s) 0x%2.2x, (uchar%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
     }
     return 0;
@@ -69,8 +60,8 @@ static int verify_addsat_short( const cl_short *inA, const cl_short *inB, const
     for( i = 0; i < n; i++ )
     {
         cl_int r = (cl_int) inA[i] + (cl_int) inB[i];
-        r = MAX( r, CL_SHRT_MIN );
-        r = MIN( r, CL_SHRT_MAX );
+        r = std::max(r, CL_SHRT_MIN);
+        r = std::min(r, CL_SHRT_MAX);
 
         if( r != outptr[i] )
         { log_info( "\n%d) Failure for add_sat( (short%s) 0x%4.4x, (short%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
@@ -84,8 +75,8 @@ static int verify_addsat_ushort( const cl_ushort *inA, const cl_ushort *inB, con
     for( i = 0; i < n; i++ )
     {
         cl_int r = (cl_int) inA[i] + (cl_int) inB[i];
-        r = MAX( r, 0 );
-        r = MIN( r, CL_USHRT_MAX );
+        r = std::max(r, 0);
+        r = std::min(r, CL_USHRT_MAX);
 
         if( r != outptr[i] )
         { log_info( "\n%d) Failure for add_sat( (ushort%s) 0x%4.4x, (ushort%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
diff --git a/test_conformance/integer_ops/test_integers.cpp b/test_conformance/integer_ops/test_integers.cpp
index 8d77b24bec..6fa18e1e0b 100644
--- a/test_conformance/integer_ops/test_integers.cpp
+++ b/test_conformance/integer_ops/test_integers.cpp
@@ -16,14 +16,9 @@
 #include "testBase.h"
 #include "harness/conversions.h"
 
-#define TEST_SIZE 512
+#include <algorithm>
 
-#ifndef MIN
-    #define MIN( _a, _b )   ((_a) < (_b) ? (_a) : (_b))
-#endif
-#ifndef MAX
-    #define MAX( _a, _b )   ((_a) > (_b) ? (_a) : (_b))
-#endif
+#define TEST_SIZE 512
 
 const char *singleParamIntegerKernelSourcePattern =
 "__kernel void sample_test(__global %s *sourceA, __global %s *destValues)\n"
@@ -1512,19 +1507,20 @@ bool verify_integer_clamp( void *sourceA, void *sourceB, void *sourceC, void *de
         switch( vecAType )
         {
             case kULong:
-                ((cl_ulong*) destination)[0] = MAX(MIN(valueA, valueC), valueB);
+                ((cl_ulong *)destination)[0] =
+                    std::max(std::min(valueA, valueC), valueB);
                 break;
             case kUInt:
-                ((cl_uint*) destination)[0] = (cl_uint)
-                    (MAX(MIN(valueA, valueC), valueB));
+                ((cl_uint *)destination)[0] =
+                    (cl_uint)(std::max(std::min(valueA, valueC), valueB));
                 break;
             case kUShort:
-                ((cl_ushort*) destination)[0] = (cl_ushort)
-                    (MAX(MIN(valueA, valueC), valueB));
+                ((cl_ushort *)destination)[0] =
+                    (cl_ushort)(std::max(std::min(valueA, valueC), valueB));
                 break;
             case kUChar:
-                ((cl_uchar*) destination)[0] = (cl_uchar)
-                    (MAX(MIN(valueA, valueC), valueB));
+                ((cl_uchar *)destination)[0] =
+                    (cl_uchar)(std::max(std::min(valueA, valueC), valueB));
                 break;
             default:
                 //error -- should never get here
@@ -1576,19 +1572,20 @@ bool verify_integer_clamp( void *sourceA, void *sourceB, void *sourceC, void *de
         switch( vecAType )
         {
             case kLong:
-                ((cl_long*) destination)[0] = MAX(MIN(valueA, valueC), valueB);
+                ((cl_long *)destination)[0] =
+                    std::max(std::min(valueA, valueC), valueB);
                 break;
             case kInt:
-                ((cl_int*) destination)[0] = (cl_int)
-                    (MAX(MIN(valueA, valueC), valueB));
+                ((cl_int *)destination)[0] =
+                    (cl_int)(std::max(std::min(valueA, valueC), valueB));
                 break;
             case kShort:
-                ((cl_short*) destination)[0] = (cl_short)
-                    (MAX(MIN(valueA, valueC), valueB));
+                ((cl_short *)destination)[0] =
+                    (cl_short)(std::max(std::min(valueA, valueC), valueB));
                 break;
             case kChar:
-                ((cl_char*) destination)[0] = (cl_char)
-                    (MAX(MIN(valueA, valueC), valueB));
+                ((cl_char *)destination)[0] =
+                    (cl_char)(std::max(std::min(valueA, valueC), valueB));
                 break;
             default:
                 //error -- should never get here
@@ -1654,13 +1651,16 @@ bool verify_integer_mad_sat( void *sourceA, void *sourceB, void *sourceC, void *
                 ((cl_ulong*) destination)[0] = multLo;
                 break;
             case kUInt:
-                ((cl_uint*) destination)[0] = (cl_uint) MIN( multLo, (cl_ulong) CL_UINT_MAX );
+                ((cl_uint *)destination)[0] =
+                    (cl_uint)std::min(multLo, (cl_ulong)CL_UINT_MAX);
                 break;
             case kUShort:
-                ((cl_ushort*) destination)[0] = (cl_ushort) MIN( multLo, (cl_ulong) CL_USHRT_MAX );
+                ((cl_ushort *)destination)[0] =
+                    (cl_ushort)std::min(multLo, (cl_ulong)CL_USHRT_MAX);
                 break;
             case kUChar:
-                ((cl_uchar*) destination)[0] = (cl_uchar) MIN( multLo, (cl_ulong) CL_UCHAR_MAX );
+                ((cl_uchar *)destination)[0] =
+                    (cl_uchar)std::min(multLo, (cl_ulong)CL_UCHAR_MAX);
                 break;
             default:
                 //error -- should never get here
@@ -1744,18 +1744,18 @@ bool verify_integer_mad_sat( void *sourceA, void *sourceB, void *sourceC, void *
                 ((cl_long*) destination)[0] = result;
                 break;
             case kInt:
-                result = MIN( result, (cl_long) CL_INT_MAX );
-                result = MAX( result, (cl_long) CL_INT_MIN );
+                result = std::min(result, (cl_long)CL_INT_MAX);
+                result = std::max(result, (cl_long)CL_INT_MIN);
                 ((cl_int*) destination)[0] = (cl_int) result;
                 break;
             case kShort:
-                result = MIN( result, (cl_long) CL_SHRT_MAX );
-                result = MAX( result, (cl_long) CL_SHRT_MIN );
+                result = std::min(result, (cl_long)CL_SHRT_MAX);
+                result = std::max(result, (cl_long)CL_SHRT_MIN);
                 ((cl_short*) destination)[0] = (cl_short) result;
                 break;
             case kChar:
-                result = MIN( result, (cl_long) CL_CHAR_MAX );
-                result = MAX( result, (cl_long) CL_CHAR_MIN );
+                result = std::min(result, (cl_long)CL_CHAR_MAX);
+                result = std::max(result, (cl_long)CL_CHAR_MIN);
                 ((cl_char*) destination)[0] = (cl_char) result;
                 break;
             default:
diff --git a/test_conformance/integer_ops/test_sub_sat.cpp b/test_conformance/integer_ops/test_sub_sat.cpp
index 845d106402..2a88ee0df7 100644
--- a/test_conformance/integer_ops/test_sub_sat.cpp
+++ b/test_conformance/integer_ops/test_sub_sat.cpp
@@ -21,19 +21,9 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
-#include "procs.h"
-
-#define UCHAR_MIN   0
-#define USHRT_MIN   0
-#define UINT_MIN    0
-
-#ifndef MAX
-#define MAX( _a, _b )   ( (_a) > (_b) ? (_a) : (_b) )
-#endif
-#ifndef MIN
-#define MIN( _a, _b )   ( (_a) < (_b) ? (_a) : (_b) )
-#endif
+#include <algorithm>
 
+#include "procs.h"
 
 static int verify_subsat_char( const cl_char *inA, const cl_char *inB, const cl_char *outptr, int n, const char *sizeName, int vecSize )
 {
@@ -41,8 +31,8 @@ static int verify_subsat_char( const cl_char *inA, const cl_char *inB, const cl_
     for( i = 0; i < n; i++ )
     {
         cl_int r = (cl_int) inA[i] - (cl_int) inB[i];
-        r = MAX( r, CL_CHAR_MIN );
-        r = MIN( r, CL_CHAR_MAX );
+        r = std::max(r, CL_CHAR_MIN);
+        r = std::min(r, CL_CHAR_MAX);
 
         if( r != outptr[i] )
         { log_info( "\n%d) Failure for sub_sat( (char%s) 0x%2.2x, (char%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
@@ -56,9 +46,9 @@ static int verify_subsat_uchar( const cl_uchar *inA, const cl_uchar *inB, const
     for( i = 0; i < n; i++ )
     {
         cl_int r = (cl_int) inA[i] - (cl_int) inB[i];
-        r = MAX( r, 0 );
-        r = MIN( r, CL_UCHAR_MAX );
-        if( r != outptr[i] )
+        r = std::max(r, 0);
+        r = std::min(r, CL_UCHAR_MAX);
+        if (r != outptr[i])
         { log_info( "\n%d) Failure for sub_sat( (uchar%s) 0x%2.2x, (uchar%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
     }
     return 0;
@@ -70,8 +60,8 @@ static int verify_subsat_short( const cl_short *inA, const cl_short *inB, const
     for( i = 0; i < n; i++ )
     {
         cl_int r = (cl_int) inA[i] - (cl_int) inB[i];
-        r = MAX( r, CL_SHRT_MIN );
-        r = MIN( r, CL_SHRT_MAX );
+        r = std::max(r, CL_SHRT_MIN);
+        r = std::min(r, CL_SHRT_MAX);
 
         if( r != outptr[i] )
         { log_info( "\n%d) Failure for sub_sat( (short%s) 0x%4.4x, (short%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
@@ -85,8 +75,8 @@ static int verify_subsat_ushort( const cl_ushort *inA, const cl_ushort *inB, con
     for( i = 0; i < n; i++ )
     {
         cl_int r = (cl_int) inA[i] - (cl_int) inB[i];
-        r = MAX( r, 0 );
-        r = MIN( r, CL_USHRT_MAX );
+        r = std::max(r, 0);
+        r = std::min(r, CL_USHRT_MAX);
 
         if( r != outptr[i] )
         { log_info( "\n%d) Failure for sub_sat( (ushort%s) 0x%4.4x, (ushort%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
diff --git a/test_conformance/integer_ops/test_unary_ops.cpp b/test_conformance/integer_ops/test_unary_ops.cpp
index 72940eaa83..c91c85aeb4 100644
--- a/test_conformance/integer_ops/test_unary_ops.cpp
+++ b/test_conformance/integer_ops/test_unary_ops.cpp
@@ -107,7 +107,7 @@ int test_unary_op( cl_command_queue queue, cl_context context, OpKonstants which
             // For sub ops, the min control value is 2. Otherwise, it's 0
             controlData[ i ] |= 0x02;
         else if( whichOp == kIncrement )
-            // For addition ops, the MAX control value is 1. Otherwise, it's 3
+            // For addition ops, the max control value is 1. Otherwise, it's 3
             controlData[ i ] &= ~0x02;
     }
     streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index 6db6aa5680..d3e8071fb3 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -496,7 +496,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
 
 
-        for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
+        for (auto k = std::max(1U, gMinVectorSizeIndex);
+             k < gMaxVectorSizeIndex; k++)
         {
             q = (cl_long *)out[k];
             // If we aren't getting the correctly rounded result
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
index d6d5c8eb98..6c7c8c05e7 100644
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -485,7 +485,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
             goto exit;
         }
 
-        for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
+        for (auto k = std::max(1U, gMinVectorSizeIndex);
+             k < gMaxVectorSizeIndex; k++)
         {
             q = out[k];
             // If we aren't getting the correctly rounded result
diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp
index 1978c185db..7f3521c6f2 100644
--- a/test_conformance/math_brute_force/macro_unary_double.cpp
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -304,7 +304,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         }
 
 
-        for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
+        for (auto k = std::max(1U, gMinVectorSizeIndex);
+             k < gMaxVectorSizeIndex; k++)
         {
             q = out[k];
             // If we aren't getting the correctly rounded result
diff --git a/test_conformance/math_brute_force/macro_unary_float.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp
index ece5e9b6af..0cd54de466 100644
--- a/test_conformance/math_brute_force/macro_unary_float.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -309,8 +309,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
             }
 
 
-            for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex;
-                 k++)
+            for (auto k = std::max(1U, gMinVectorSizeIndex);
+                 k < gMaxVectorSizeIndex; k++)
             {
                 q = out[k];
                 // If we aren't getting the correctly rounded result
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 6691f4626c..1a6e0c4e1c 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -18,6 +18,7 @@
 #include "sleep.h"
 #include "utility.h"
 
+#include <algorithm>
 #include <cstdio>
 #include <cstdlib>
 #include <ctime>
@@ -1239,7 +1240,7 @@ float Bruteforce_Ulp_Error_Double(double test, long double reference)
 
         // The unbiased exponent of the ulp unit place
         int ulp_exp =
-            DBL_MANT_DIG - 1 - MAX(ilogbl(reference), DBL_MIN_EXP - 1);
+            DBL_MANT_DIG - 1 - std::max(ilogbl(reference), DBL_MIN_EXP - 1);
 
         // Scale the exponent of the error
         float result = (float)scalbnl(testVal - reference, ulp_exp);
@@ -1255,7 +1256,7 @@ float Bruteforce_Ulp_Error_Double(double test, long double reference)
     // reference is a normal power of two or a zero
     // The unbiased exponent of the ulp unit place
     int ulp_exp =
-        DBL_MANT_DIG - 1 - MAX(ilogbl(reference) - 1, DBL_MIN_EXP - 1);
+        DBL_MANT_DIG - 1 - std::max(ilogbl(reference) - 1, DBL_MIN_EXP - 1);
 
     // allow correctly rounded results to pass through unmolested. (We might add
     // error to it below.) There is something of a performance optimization here
diff --git a/test_conformance/profiling/execute.cpp b/test_conformance/profiling/execute.cpp
index edfc043c06..0541bfa52a 100644
--- a/test_conformance/profiling/execute.cpp
+++ b/test_conformance/profiling/execute.cpp
@@ -21,6 +21,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <algorithm>
+
 #include "procs.h"
 #include "harness/testHarness.h"
 #include "harness/errorHelpers.h"
@@ -29,12 +31,6 @@
 typedef unsigned char uchar;
 #endif
 
-#undef MIN
-#define MIN(x,y)    ( (x) < (y) ? (x) : (y) )
-
-#undef MAX
-#define MAX(x,y)    ( (x) > (y) ? (x) : (y) )
-
 //#define CREATE_OUTPUT    1
 
 extern int writePPM( const char *filename, uchar *buf, int xsize, int ysize );
@@ -73,8 +69,8 @@ static const char *image_filter_src =
 static void read_imagef( int x, int y, int w, int h, int nChannels, uchar *src, float *srcRgb )
 {
     // clamp the coords
-    int    x0 = MIN( MAX( x, 0 ), w - 1 );
-    int    y0 = MIN( MAX( y, 0 ), h - 1 );
+    int x0 = std::min(std::max(x, 0), w - 1);
+    int y0 = std::min(std::max(y, 0), h - 1);
 
     // get tine index
     int    indx = ( y0 * w + x0 ) * nChannels;
diff --git a/test_conformance/workgroups/test_wg_broadcast.cpp b/test_conformance/workgroups/test_wg_broadcast.cpp
index 35559476ba..29380211a4 100644
--- a/test_conformance/workgroups/test_wg_broadcast.cpp
+++ b/test_conformance/workgroups/test_wg_broadcast.cpp
@@ -20,6 +20,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <algorithm>
+
 #include "procs.h"
 
 
@@ -310,7 +312,7 @@ test_work_group_broadcast_2D(cl_device_id device, cl_context context, cl_command
         localsize[0] = localsize[1] = 1;
     }
 
-    num_workgroups = MAX(n_elems/wg_size[0], 16);
+    num_workgroups = std::max(n_elems / wg_size[0], (size_t)16);
     globalsize[0] = num_workgroups * localsize[0];
     globalsize[1] = num_workgroups * localsize[1];
     num_elements = globalsize[0] * globalsize[1];
@@ -437,7 +439,7 @@ test_work_group_broadcast_3D(cl_device_id device, cl_context context, cl_command
         localsize[0] = localsize[1] = localsize[2] = 1;
     }
 
-    num_workgroups = MAX(n_elems/wg_size[0], 8);
+    num_workgroups = std::max(n_elems / wg_size[0], (size_t)8);
     globalsize[0] = num_workgroups * localsize[0];
     globalsize[1] = num_workgroups * localsize[1];
     globalsize[2] = num_workgroups * localsize[2];
diff --git a/test_conformance/workgroups/test_wg_scan_exclusive_max.cpp b/test_conformance/workgroups/test_wg_scan_exclusive_max.cpp
index 12338b685e..644b3ccf31 100644
--- a/test_conformance/workgroups/test_wg_scan_exclusive_max.cpp
+++ b/test_conformance/workgroups/test_wg_scan_exclusive_max.cpp
@@ -20,8 +20,9 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
-#include "procs.h"
+#include <algorithm>
 
+#include "procs.h"
 
 const char *wg_scan_exclusive_max_kernel_code_int =
 "__kernel void test_wg_scan_exclusive_max_int(global int *input, global int *output)\n"
@@ -79,7 +80,7 @@ verify_wg_scan_exclusive_max_int(int *inptr, int *outptr, size_t n, size_t wg_si
                 log_info("work_group_scan_exclusive_max int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), max_, outptr[j+i]);
                 return -1;
             }
-            max_ = MAX(inptr[j+i], max_);
+            max_ = std::max(inptr[j + i], max_);
         }
     }
 
@@ -103,7 +104,7 @@ verify_wg_scan_exclusive_max_uint(unsigned int *inptr, unsigned int *outptr, siz
                 log_info("work_group_scan_exclusive_max int: Error at %u: expected = %u, got = %u\n", (unsigned int)(j+i), max_, outptr[j+i]);
                 return -1;
             }
-            max_ = MAX(inptr[j+i], max_);
+            max_ = std::max(inptr[j + i], max_);
         }
     }
 
@@ -127,7 +128,7 @@ verify_wg_scan_exclusive_max_long(cl_long *inptr, cl_long *outptr, size_t n, siz
                 log_info("work_group_scan_exclusive_max long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), max_, outptr[j+i]);
                 return -1;
             }
-            max_ = MAX(inptr[j+i], max_);
+            max_ = std::max(inptr[j + i], max_);
         }
     }
 
@@ -151,7 +152,7 @@ verify_wg_scan_exclusive_max_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n,
                 log_info("work_group_scan_exclusive_max ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), max_, outptr[j+i]);
                 return -1;
             }
-            max_ = MAX(inptr[j+i], max_);
+            max_ = std::max(inptr[j + i], max_);
         }
     }
 
diff --git a/test_conformance/workgroups/test_wg_scan_exclusive_min.cpp b/test_conformance/workgroups/test_wg_scan_exclusive_min.cpp
index f4e6bf9772..3c6dfc8755 100644
--- a/test_conformance/workgroups/test_wg_scan_exclusive_min.cpp
+++ b/test_conformance/workgroups/test_wg_scan_exclusive_min.cpp
@@ -20,8 +20,9 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
-#include "procs.h"
+#include <algorithm>
 
+#include "procs.h"
 
 const char *wg_scan_exclusive_min_kernel_code_int =
 "__kernel void test_wg_scan_exclusive_min_int(global int *input, global int *output)\n"
@@ -80,7 +81,7 @@ verify_wg_scan_exclusive_min_int(int *inptr, int *outptr, size_t n, size_t wg_si
                 log_info("work_group_scan_exclusive_min int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), min_, outptr[j+i]);
                 return -1;
             }
-            min_ = MIN(inptr[j+i], min_);
+            min_ = std::min(inptr[j + i], min_);
         }
     }
 
@@ -104,7 +105,7 @@ verify_wg_scan_exclusive_min_uint(unsigned int *inptr, unsigned int *outptr, siz
                 log_info("work_group_scan_exclusive_min int: Error at %u: expected = %u, got = %u\n", j+i, min_, outptr[j+i]);
                 return -1;
             }
-            min_ = MIN(inptr[j+i], min_);
+            min_ = std::min(inptr[j + i], min_);
         }
     }
 
@@ -128,7 +129,7 @@ verify_wg_scan_exclusive_min_long(cl_long *inptr, cl_long *outptr, size_t n, siz
                 log_info("work_group_scan_exclusive_min long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), min_, outptr[j+i]);
                 return -1;
             }
-            min_ = MIN(inptr[j+i], min_);
+            min_ = std::min(inptr[j + i], min_);
         }
     }
 
@@ -152,7 +153,7 @@ verify_wg_scan_exclusive_min_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n,
                 log_info("work_group_scan_exclusive_min ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), min_, outptr[j+i]);
                 return -1;
             }
-            min_ = MIN(inptr[j+i], min_);
+            min_ = std::min(inptr[j + i], min_);
         }
     }
 
diff --git a/test_conformance/workgroups/test_wg_scan_inclusive_max.cpp b/test_conformance/workgroups/test_wg_scan_inclusive_max.cpp
index 44ebf8059e..2a2e230e23 100644
--- a/test_conformance/workgroups/test_wg_scan_inclusive_max.cpp
+++ b/test_conformance/workgroups/test_wg_scan_inclusive_max.cpp
@@ -20,6 +20,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <algorithm>
+
 #include "procs.h"
 
 
@@ -75,7 +77,7 @@ verify_wg_scan_inclusive_max_int(int *inptr, int *outptr, size_t n, size_t wg_si
             m = wg_size;
 
         for (i = 0; i < m; ++i) {
-            max_ = MAX(inptr[j+i], max_);
+            max_ = std::max(inptr[j + i], max_);
             if (outptr[j+i] != max_) {
                 log_info("work_group_scan_inclusive_max int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), max_, outptr[j+i]);
                 return -1;
@@ -99,7 +101,7 @@ verify_wg_scan_inclusive_max_uint(unsigned int *inptr, unsigned int *outptr, siz
             m = wg_size;
 
         for (i = 0; i < m; ++i) {
-            max_ = MAX(inptr[j+i], max_);
+            max_ = std::max(inptr[j + i], max_);
             if (outptr[j+i] != max_) {
                 log_info("work_group_scan_inclusive_max int: Error at %lu: expected = %u, got = %u\n", (unsigned long)(j+i), max_, outptr[j+i]);
                 return -1;
@@ -123,7 +125,7 @@ verify_wg_scan_inclusive_max_long(cl_long *inptr, cl_long *outptr, size_t n, siz
             m = wg_size;
 
         for (i = 0; i < m; ++i) {
-            max_ = MAX(inptr[j+i], max_);
+            max_ = std::max(inptr[j + i], max_);
             if (outptr[j+i] != max_) {
                 log_info("work_group_scan_inclusive_max long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), max_, outptr[j+i]);
                 return -1;
@@ -147,7 +149,7 @@ verify_wg_scan_inclusive_max_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n,
             m = wg_size;
 
         for (i = 0; i < m; ++i) {
-            max_ = MAX(inptr[j+i], max_);
+            max_ = std::max(inptr[j + i], max_);
             if (outptr[j+i] != max_) {
                 log_info("work_group_scan_inclusive_max ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), max_, outptr[j+i]);
                 return -1;
diff --git a/test_conformance/workgroups/test_wg_scan_inclusive_min.cpp b/test_conformance/workgroups/test_wg_scan_inclusive_min.cpp
index f2f05788f5..adbdad56f1 100644
--- a/test_conformance/workgroups/test_wg_scan_inclusive_min.cpp
+++ b/test_conformance/workgroups/test_wg_scan_inclusive_min.cpp
@@ -20,6 +20,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <algorithm>
+
 #include "procs.h"
 
 
@@ -75,7 +77,7 @@ verify_wg_scan_inclusive_min_int(int *inptr, int *outptr, size_t n, size_t wg_si
             m = wg_size;
 
         for (i = 0; i < m; ++i) {
-            min_ = MIN(inptr[j+i], min_);
+            min_ = std::min(inptr[j + i], min_);
             if (outptr[j+i] != min_) {
                 log_info("work_group_scan_inclusive_min int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), min_, outptr[j+i]);
                 return -1;
@@ -99,7 +101,7 @@ verify_wg_scan_inclusive_min_uint(unsigned int *inptr, unsigned int *outptr, siz
             m = wg_size;
 
         for (i = 0; i < m; ++i) {
-            min_ = MIN(inptr[j+i], min_);
+            min_ = std::min(inptr[j + i], min_);
             if (outptr[j+i] != min_) {
                 log_info("work_group_scan_inclusive_min int: Error at %u: expected = %u, got = %u\n", (unsigned int)(j+i), min_, outptr[j+i]);
                 return -1;
@@ -123,7 +125,7 @@ verify_wg_scan_inclusive_min_long(cl_long *inptr, cl_long *outptr, size_t n, siz
             m = wg_size;
 
         for (i = 0; i < m; ++i) {
-            min_ = MIN(inptr[j+i], min_);
+            min_ = std::min(inptr[j + i], min_);
             if (outptr[j+i] != min_) {
                 log_info("work_group_scan_inclusive_min long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), min_, outptr[j+i]);
                 return -1;
@@ -147,7 +149,7 @@ verify_wg_scan_inclusive_min_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n,
             m = wg_size;
 
         for (i = 0; i < m; ++i) {
-            min_ = MIN(inptr[j+i], min_);
+            min_ = std::min(inptr[j + i], min_);
             if (outptr[j+i] != min_) {
                 log_info("work_group_scan_inclusive_min ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), min_, outptr[j+i]);
                 return -1;

From ddca0f802bee72ff9ea90b1dab28dddc51ef9a20 Mon Sep 17 00:00:00 2001
From: Sreelakshmi Haridas Maruthur <sharidas@quicinc.com>
Date: Tue, 28 Sep 2021 11:19:17 -0600
Subject: [PATCH 129/158] gles: Fix double frees. (#1323)

* gles: Fix double frees.

Remove a few explicit frees in the redirect_buffers test which are
already handled by a wrapper.

* gles: Fix double frees

A recent update to the object wrapper classes (#1268) changed the
behavior of assigning to a wrapper, whereby the wrapped object is now
released upon assignment. A couple of tests were manually calling
clReleaseMemObject and then assigning `nullptr` to the wrapper,
resulting in the wrapper calling clReleaseMemObject on an object that
had already been destroyed.

Co-authored-by: spauls <spauls@qti.qualcomm.com>
---
 test_conformance/gles/test_buffers.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/test_conformance/gles/test_buffers.cpp b/test_conformance/gles/test_buffers.cpp
index a2d67322a9..73711261a5 100644
--- a/test_conformance/gles/test_buffers.cpp
+++ b/test_conformance/gles/test_buffers.cpp
@@ -205,10 +205,10 @@ int test_buffer_kernel(cl_context context, cl_command_queue queue, ExplicitType
   if (validate_only) {
     int result = (CheckGLObjectInfo(streams[0], CL_GL_OBJECT_BUFFER, (GLuint)inGLBuffer, (GLenum)0, 0) |
                   CheckGLObjectInfo(streams[2], CL_GL_OBJECT_BUFFER, (GLuint)outGLBuffer, (GLenum)0, 0) );
-    for(i=0;i<3;i++)
+
+    for (i = 0; i < 3; i++)
     {
-        clReleaseMemObject(streams[i]);
-        streams[i] = NULL;
+        streams[i].reset();
     }
 
     glDeleteBuffers(1, &inGLBuffer);    inGLBuffer = 0;
@@ -285,10 +285,9 @@ int test_buffer_kernel(cl_context context, cl_command_queue queue, ExplicitType
         clP += get_explicit_type_size( vecType );
     }
 
-    for(i=0;i<3;i++)
+    for (i = 0; i < 3; i++)
     {
-        clReleaseMemObject(streams[i]);
-        streams[i] = NULL;
+        streams[i].reset();
     }
 
     glDeleteBuffers(1, &inGLBuffer);    inGLBuffer = 0;

From 4fb5deeec1e38bfa796b1cc0e93294ba1983b473 Mon Sep 17 00:00:00 2001
From: Sreelakshmi Haridas Maruthur <sharidas@quicinc.com>
Date: Tue, 28 Sep 2021 11:19:40 -0600
Subject: [PATCH 130/158] api: Enable cl_khr_fp16 when using half types in
 kernel (#1327)

---
 test_conformance/api/test_kernel_arg_info.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/test_conformance/api/test_kernel_arg_info.cpp b/test_conformance/api/test_kernel_arg_info.cpp
index 8073e0defe..dddb4a2328 100644
--- a/test_conformance/api/test_kernel_arg_info.cpp
+++ b/test_conformance/api/test_kernel_arg_info.cpp
@@ -167,7 +167,8 @@ static std::string generate_argument(const KernelArgInfo& kernel_arg)
 /* This function generates a kernel source and allows for multiple arguments to
  * be passed in and subsequently queried. */
 static std::string generate_kernel(const std::vector<KernelArgInfo>& all_args,
-                                   const bool supports_3d_image_writes = false)
+                                   const bool supports_3d_image_writes = false,
+                                   const bool kernel_uses_half_type = false)
 {
 
     std::string ret;
@@ -175,6 +176,10 @@ static std::string generate_kernel(const std::vector<KernelArgInfo>& all_args,
     {
         ret += "#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable\n";
     }
+    if (kernel_uses_half_type)
+    {
+        ret += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+    }
     ret += "kernel void get_kernel_arg_info(\n";
     for (int i = 0; i < all_args.size(); ++i)
     {
@@ -673,8 +678,8 @@ static int run_scalar_vector_tests(cl_context context, cl_device_id deviceID)
                     if (param_size + total_param_size >= max_param_size
                         || all_args.size() == MAX_NUMBER_OF_KERNEL_ARGS)
                     {
-                        const std::string kernel_src =
-                            generate_kernel(all_args);
+                        const std::string kernel_src = generate_kernel(
+                            all_args, false, device_supports_half(deviceID));
                         failed_tests += compare_kernel_with_expected(
                             context, deviceID, kernel_src.c_str(),
                             expected_args);
@@ -696,7 +701,8 @@ static int run_scalar_vector_tests(cl_context context, cl_device_id deviceID)
             }
         }
     }
-    const std::string kernel_src = generate_kernel(all_args);
+    const std::string kernel_src =
+        generate_kernel(all_args, false, device_supports_half(deviceID));
     failed_tests += compare_kernel_with_expected(
         context, deviceID, kernel_src.c_str(), expected_args);
     return failed_tests;

From 2b770c4f348d9ad71a22c3b949a1cffe32e9d1f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=A9vin=20Petit?= <kpet@free.fr>
Date: Wed, 29 Sep 2021 12:38:42 +0100
Subject: [PATCH 131/158] Update cl_khr_integer_dot_product tests for v2
 (#1317)

* Update cl_khr_integer_dot_product tests for v2

Signed-off-by: Kevin Petit <kevin.petit@arm.com>
Signed-off-by: Marco Cattani <marco.cattani@arm.com>
Change-Id: I97dbd820f1f32f6b377e47d0bf638f36bb91930a

* only query acceleration properties with v2+

Change-Id: I3f13a0cba7f1f686365b10adf81690e089cd3d74
---
 test_common/harness/deviceInfo.cpp            | 34 ++++++++++
 test_common/harness/deviceInfo.h              |  5 ++
 .../integer_ops/test_integer_dot_product.cpp  | 67 +++++++++++++++++++
 3 files changed, 106 insertions(+)

diff --git a/test_common/harness/deviceInfo.cpp b/test_common/harness/deviceInfo.cpp
index 287a142303..97ab8c8553 100644
--- a/test_common/harness/deviceInfo.cpp
+++ b/test_common/harness/deviceInfo.cpp
@@ -63,6 +63,40 @@ int is_extension_available(cl_device_id device, const char *extensionName)
     return false;
 }
 
+cl_version get_extension_version(cl_device_id device, const char *extensionName)
+{
+    cl_int err;
+    size_t size;
+
+    err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS_WITH_VERSION, 0, nullptr,
+                          &size);
+    if (err != CL_SUCCESS)
+    {
+        throw std::runtime_error("clGetDeviceInfo(CL_DEVICE_EXTENSIONS_WITH_"
+                                 "VERSION) failed to return size\n");
+    }
+
+    std::vector<cl_name_version> extensions(size / sizeof(cl_name_version));
+    err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS_WITH_VERSION, size,
+                          extensions.data(), &size);
+    if (err != CL_SUCCESS)
+    {
+        throw std::runtime_error("clGetDeviceInfo(CL_DEVICE_EXTENSIONS_WITH_"
+                                 "VERSION) failed to return value\n");
+    }
+
+    for (auto &ext : extensions)
+    {
+        if (!strcmp(extensionName, ext.name))
+        {
+            return ext.version;
+        }
+    }
+
+    throw std::runtime_error("Extension " + std::string(extensionName)
+                             + " not supported by device!");
+}
+
 /* Returns a string containing the supported extensions list for a device. */
 std::string get_device_extensions_string(cl_device_id device)
 {
diff --git a/test_common/harness/deviceInfo.h b/test_common/harness/deviceInfo.h
index f8c55805cd..912dd198ac 100644
--- a/test_common/harness/deviceInfo.h
+++ b/test_common/harness/deviceInfo.h
@@ -31,6 +31,11 @@ std::string get_device_info_string(cl_device_id device,
 /* Determines if an extension is supported by a device. */
 int is_extension_available(cl_device_id device, const char *extensionName);
 
+/* Returns the version of the extension the device supports or throws an
+ * exception if the extension is not supported by the device. */
+cl_version get_extension_version(cl_device_id device,
+                                 const char *extensionName);
+
 /* Returns a string containing the supported extensions list for a device. */
 std::string get_device_extensions_string(cl_device_id device);
 
diff --git a/test_conformance/integer_ops/test_integer_dot_product.cpp b/test_conformance/integer_ops/test_integer_dot_product.cpp
index be25b320ab..602d59b628 100644
--- a/test_conformance/integer_ops/test_integer_dot_product.cpp
+++ b/test_conformance/integer_ops/test_integer_dot_product.cpp
@@ -336,6 +336,21 @@ int test_integer_dot_product(cl_device_id deviceID, cl_context context,
         return TEST_SKIPPED_ITSELF;
     }
 
+    Version deviceVersion = get_device_cl_version(deviceID);
+    cl_version extensionVersion;
+
+    if ((deviceVersion >= Version(3, 0))
+        || is_extension_available(deviceID, "cl_khr_extended_versioning"))
+    {
+        extensionVersion =
+            get_extension_version(deviceID, "cl_khr_integer_dot_product");
+    }
+    else
+    {
+        // Assume 1.0.0 is supported if the version can't be queried
+        extensionVersion = CL_MAKE_VERSION(1, 0, 0);
+    }
+
     cl_int error = CL_SUCCESS;
     int result = TEST_PASS;
 
@@ -346,12 +361,63 @@ int test_integer_dot_product(cl_device_id deviceID, cl_context context,
     test_error(
         error,
         "Unable to query CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR");
+
+    // Check that the required capabilities are reported
     test_assert_error(
         dotCaps & CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR,
         "When cl_khr_integer_dot_product is supported "
         "CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR must be "
         "supported");
 
+    if (extensionVersion >= CL_MAKE_VERSION(2, 0, 0))
+    {
+        test_assert_error(
+            dotCaps & CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR,
+            "When cl_khr_integer_dot_product is supported with version >= "
+            "2.0.0 CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR must be "
+            "supported");
+    }
+
+    // Check that acceleration properties can be queried
+    if (extensionVersion >= CL_MAKE_VERSION(2, 0, 0))
+    {
+        size_t size_ret;
+        error = clGetDeviceInfo(
+            deviceID,
+            CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR, 0,
+            nullptr, &size_ret);
+        test_error(
+            error,
+            "Unable to query size of data returned by "
+            "CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR");
+
+        cl_device_integer_dot_product_acceleration_properties_khr
+            accelerationProperties;
+        error = clGetDeviceInfo(
+            deviceID,
+            CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR,
+            sizeof(accelerationProperties), &accelerationProperties, nullptr);
+        test_error(error, "Unable to query 8-bit acceleration properties");
+
+        error = clGetDeviceInfo(
+            deviceID,
+            CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR,
+            0, nullptr, &size_ret);
+        test_error(
+            error,
+            "Unable to query size of data returned by "
+            "CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_"
+            "PACKED_KHR");
+
+        error = clGetDeviceInfo(
+            deviceID,
+            CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR,
+            sizeof(accelerationProperties), &accelerationProperties, nullptr);
+        test_error(error,
+                   "Unable to query 4x8-bit packed acceleration properties");
+    }
+
+    // Report when unknown capabilities are found
     if (dotCaps
         & ~(CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR
             | CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR))
@@ -359,6 +425,7 @@ int test_integer_dot_product(cl_device_id deviceID, cl_context context,
         log_info("NOTE: found an unknown / untested capability!\n");
     }
 
+    // Test built-in functions
     if (dotCaps & CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR)
     {
         result |= test_vectype<cl_uchar, cl_uint, 4>(deviceID, context, queue,

From 903f1bf65dfe15956295eb9379f5706568d858a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=A9vin=20Petit?= <kpet@free.fr>
Date: Thu, 30 Sep 2021 13:33:18 +0100
Subject: [PATCH 132/158] Report unsupported extended subgroup tests as skipped
 rather than passed (#1301)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Report unsupported extended subgroup tests as skipped rather than passed

Also don't check the presence of extensions for each sub-test.

Signed-off-by: Kévin Petit <kpet@free.fr>

* address review comments
---
 test_conformance/subgroups/subhelpers.h         | 17 +----------------
 .../subgroups/test_subgroup_ballot.cpp          | 10 +++++++---
 .../test_subgroup_clustered_reduce.cpp          | 12 +++++++-----
 .../subgroups/test_subgroup_extended_types.cpp  | 12 +++++++-----
 .../test_subgroup_non_uniform_arithmetic.cpp    | 15 +++++++++------
 .../test_subgroup_non_uniform_vote.cpp          | 13 +++++++------
 .../subgroups/test_subgroup_shuffle.cpp         | 10 +++++++---
 .../test_subgroup_shuffle_relative.cpp          | 12 +++++++-----
 8 files changed, 52 insertions(+), 49 deletions(-)

diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h
index 9232cdedc0..0d497fb325 100644
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -33,10 +33,9 @@ extern cl_half_rounding_mode g_rounding_mode;
 struct WorkGroupParams
 {
     WorkGroupParams(size_t gws, size_t lws,
-                    const std::vector<std::string> &req_ext = {},
                     const std::vector<uint32_t> &all_wim = {})
         : global_workgroup_size(gws), local_workgroup_size(lws),
-          required_extensions(req_ext), all_work_item_masks(all_wim)
+          all_work_item_masks(all_wim)
     {
         subgroup_size = 0;
         work_items_mask = 0;
@@ -49,7 +48,6 @@ struct WorkGroupParams
     uint32_t work_items_mask;
     int dynsc;
     bool use_core_subgroups;
-    std::vector<std::string> required_extensions;
     std::vector<uint32_t> all_work_item_masks;
 };
 
@@ -1297,19 +1295,6 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
             }
         }
 
-        for (std::string extension : test_params.required_extensions)
-        {
-            if (!is_extension_available(device, extension.c_str()))
-            {
-                log_info("The extension %s not supported on this device. SKIP "
-                         "testing - kernel %s data type %s\n",
-                         extension.c_str(), kname, TypeManager<Ty>::name());
-                return TEST_PASS;
-            }
-            kernel_sstr << "#pragma OPENCL EXTENSION " + extension
-                    + ": enable\n";
-        }
-
         error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform),
                                 (void *)&platform, NULL);
         test_error(error, "clGetDeviceInfo failed for CL_DEVICE_PLATFORM");
diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp
index f2e4060b0b..9a2da5d9c7 100644
--- a/test_conformance/subgroups/test_subgroup_ballot.cpp
+++ b/test_conformance/subgroups/test_subgroup_ballot.cpp
@@ -926,11 +926,15 @@ template <typename T> int run_non_uniform_broadcast_for_type(RunTestForType rft)
 int test_subgroup_functions_ballot(cl_device_id device, cl_context context,
                                    cl_command_queue queue, int num_elements)
 {
-    std::vector<std::string> required_extensions = { "cl_khr_subgroup_ballot" };
+    if (!is_extension_available(device, "cl_khr_subgroup_ballot"))
+    {
+        log_info("cl_khr_subgroup_ballot is not supported on this device, "
+                 "skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
     constexpr size_t global_work_size = 170;
     constexpr size_t local_work_size = 64;
-    WorkGroupParams test_params(global_work_size, local_work_size,
-                                required_extensions);
+    WorkGroupParams test_params(global_work_size, local_work_size);
     RunTestForType rft(device, context, queue, num_elements, test_params);
 
     // non uniform broadcast functions
diff --git a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
index 588e9cee18..87507e3792 100644
--- a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
+++ b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
@@ -305,13 +305,15 @@ int test_subgroup_functions_clustered_reduce(cl_device_id device,
                                              cl_command_queue queue,
                                              int num_elements)
 {
-    std::vector<std::string> required_extensions = {
-        "cl_khr_subgroup_clustered_reduce"
-    };
+    if (!is_extension_available(device, "cl_khr_subgroup_clustered_reduce"))
+    {
+        log_info("cl_khr_subgroup_clustered_reduce is not supported on this "
+                 "device, skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
-    WorkGroupParams test_params(global_work_size, local_work_size,
-                                required_extensions);
+    WorkGroupParams test_params(global_work_size, local_work_size);
     RunTestForType rft(device, context, queue, num_elements, test_params);
 
     int error = run_cluster_red_add_max_min_mul_for_type<cl_int>(rft);
diff --git a/test_conformance/subgroups/test_subgroup_extended_types.cpp b/test_conformance/subgroups/test_subgroup_extended_types.cpp
index 98401b8ef0..b281f6187c 100644
--- a/test_conformance/subgroups/test_subgroup_extended_types.cpp
+++ b/test_conformance/subgroups/test_subgroup_extended_types.cpp
@@ -59,13 +59,15 @@ int test_subgroup_functions_extended_types(cl_device_id device,
                                            cl_command_queue queue,
                                            int num_elements)
 {
-    std::vector<std::string> required_extensions = {
-        "cl_khr_subgroup_extended_types"
-    };
+    if (!is_extension_available(device, "cl_khr_subgroup_extended_types"))
+    {
+        log_info("cl_khr_subgroup_extended_types is not supported on this "
+                 "device, skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
-    WorkGroupParams test_params(global_work_size, local_work_size,
-                                required_extensions);
+    WorkGroupParams test_params(global_work_size, local_work_size);
     RunTestForType rft(device, context, queue, num_elements, test_params);
 
     int error = run_broadcast_for_extended_type<cl_uint2>(rft);
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
index eb46ff092c..6c44249edb 100644
--- a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
@@ -434,17 +434,20 @@ int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device,
                                                    cl_command_queue queue,
                                                    int num_elements)
 {
-    std::vector<std::string> required_extensions = {
-        "cl_khr_subgroup_non_uniform_arithmetic"
-    };
+    if (!is_extension_available(device,
+                                "cl_khr_subgroup_non_uniform_arithmetic"))
+    {
+        log_info("cl_khr_subgroup_non_uniform_arithmetic is not supported on "
+                 "this device, skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
     std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
                                  0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
                                  0x00ffff00, 0x80000000, 0xaaaaaaaa };
 
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
-    WorkGroupParams test_params(global_work_size, local_work_size,
-                                required_extensions, masks);
+    WorkGroupParams test_params(global_work_size, local_work_size, masks);
     RunTestForType rft(device, context, queue, num_elements, test_params);
 
     int error = run_functions_add_mul_max_min_for_type<cl_int>(rft);
@@ -470,4 +473,4 @@ int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device,
 
     error |= run_functions_logical_and_or_xor_for_type<cl_int>(rft);
     return error;
-}
\ No newline at end of file
+}
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
index 2b00b4dd27..484e9b6b49 100644
--- a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
@@ -272,17 +272,18 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device,
                                              cl_command_queue queue,
                                              int num_elements)
 {
-    std::vector<std::string> required_extensions = {
-        "cl_khr_subgroup_non_uniform_vote"
-    };
-
+    if (!is_extension_available(device, "cl_khr_subgroup_non_uniform_vote"))
+    {
+        log_info("cl_khr_subgroup_non_uniform_vote is not supported on this "
+                 "device, skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
     std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
                                  0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
                                  0x00ffff00, 0x80000000 };
     constexpr size_t global_work_size = 170;
     constexpr size_t local_work_size = 64;
-    WorkGroupParams test_params(global_work_size, local_work_size,
-                                required_extensions, masks);
+    WorkGroupParams test_params(global_work_size, local_work_size, masks);
     RunTestForType rft(device, context, queue, num_elements, test_params);
 
     int error = run_vote_all_equal_for_type<cl_int>(rft);
diff --git a/test_conformance/subgroups/test_subgroup_shuffle.cpp b/test_conformance/subgroups/test_subgroup_shuffle.cpp
index 049f09824b..37b27cedb5 100644
--- a/test_conformance/subgroups/test_subgroup_shuffle.cpp
+++ b/test_conformance/subgroups/test_subgroup_shuffle.cpp
@@ -55,11 +55,15 @@ template <typename T> int run_shuffle_for_type(RunTestForType rft)
 int test_subgroup_functions_shuffle(cl_device_id device, cl_context context,
                                     cl_command_queue queue, int num_elements)
 {
-    std::vector<std::string> required_extensions{ "cl_khr_subgroup_shuffle" };
+    if (!is_extension_available(device, "cl_khr_subgroup_shuffle"))
+    {
+        log_info("cl_khr_subgroup_shuffle is not supported on this device, "
+                 "skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
-    WorkGroupParams test_params(global_work_size, local_work_size,
-                                required_extensions);
+    WorkGroupParams test_params(global_work_size, local_work_size);
     RunTestForType rft(device, context, queue, num_elements, test_params);
 
     int error = run_shuffle_for_type<cl_int>(rft);
diff --git a/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp
index 6000c9702a..11401e80bc 100644
--- a/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp
+++ b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp
@@ -56,13 +56,15 @@ int test_subgroup_functions_shuffle_relative(cl_device_id device,
                                              cl_command_queue queue,
                                              int num_elements)
 {
-    std::vector<std::string> required_extensions = {
-        "cl_khr_subgroup_shuffle_relative"
-    };
+    if (!is_extension_available(device, "cl_khr_subgroup_shuffle_relative"))
+    {
+        log_info("cl_khr_subgroup_shuffle_relative is not supported on this "
+                 "device, skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
-    WorkGroupParams test_params(global_work_size, local_work_size,
-                                required_extensions);
+    WorkGroupParams test_params(global_work_size, local_work_size);
     RunTestForType rft(device, context, queue, num_elements, test_params);
 
     int error = run_shuffle_relative_for_type<cl_int>(rft);

From 92844bead1afdf75b56085c2cda34be27458a582 Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Fri, 1 Oct 2021 12:28:37 +0200
Subject: [PATCH 133/158] Extended subgroups - use 128bit masks (#1215)

* Extended subgroups - use 128bit masks

* Refactoring to avoid kernel code duplication

* unify kernel names as the test_ prefix + the subgroup function name
* use string literals to improve readability
* use kernel templates to limit code duplication
* WorkGroupParams allows defining a default kernel - a kernel template for multiple functions
* WorkGroupParams allows defining a kernel for one specific subgroup function

Co-authored-by: Stuart Brady <stuart.brady@arm.com>
---
 .../subgroups/subgroup_common_kernels.cpp     | 104 +----
 .../subgroups/subgroup_common_kernels.h       |  12 +-
 .../subgroups/subgroup_common_templates.h     |  98 ++--
 test_conformance/subgroups/subhelpers.h       | 181 +++++++-
 test_conformance/subgroups/test_subgroup.cpp  |  47 +-
 .../subgroups/test_subgroup_ballot.cpp        | 425 +++++++-----------
 .../test_subgroup_clustered_reduce.cpp        | 176 +-------
 .../test_subgroup_extended_types.cpp          |  44 +-
 .../test_subgroup_non_uniform_arithmetic.cpp  | 409 +++--------------
 .../test_subgroup_non_uniform_vote.cpp        |  93 ++--
 .../subgroups/test_subgroup_shuffle.cpp       |  29 +-
 .../test_subgroup_shuffle_relative.cpp        |  28 +-
 12 files changed, 592 insertions(+), 1054 deletions(-)

diff --git a/test_conformance/subgroups/subgroup_common_kernels.cpp b/test_conformance/subgroups/subgroup_common_kernels.cpp
index f8b244504a..33a51637d6 100644
--- a/test_conformance/subgroups/subgroup_common_kernels.cpp
+++ b/test_conformance/subgroups/subgroup_common_kernels.cpp
@@ -15,92 +15,20 @@
 //
 #include "subgroup_common_kernels.h"
 
-const char* bcast_source =
-    "__kernel void test_bcast(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint which_sub_group_local_id = xy[gid].z;\n"
-    "    out[gid] = sub_group_broadcast(x, which_sub_group_local_id);\n"
 
-    "}\n";
-
-const char* redadd_source = "__kernel void test_redadd(const __global Type "
-                            "*in, __global int4 *xy, __global Type *out)\n"
-                            "{\n"
-                            "    int gid = get_global_id(0);\n"
-                            "    XY(xy,gid);\n"
-                            "    out[gid] = sub_group_reduce_add(in[gid]);\n"
-                            "}\n";
-
-const char* redmax_source = "__kernel void test_redmax(const __global Type "
-                            "*in, __global int4 *xy, __global Type *out)\n"
-                            "{\n"
-                            "    int gid = get_global_id(0);\n"
-                            "    XY(xy,gid);\n"
-                            "    out[gid] = sub_group_reduce_max(in[gid]);\n"
-                            "}\n";
-
-const char* redmin_source = "__kernel void test_redmin(const __global Type "
-                            "*in, __global int4 *xy, __global Type *out)\n"
-                            "{\n"
-                            "    int gid = get_global_id(0);\n"
-                            "    XY(xy,gid);\n"
-                            "    out[gid] = sub_group_reduce_min(in[gid]);\n"
-                            "}\n";
-
-const char* scinadd_source =
-    "__kernel void test_scinadd(const __global Type *in, __global int4 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
-    "}\n";
-
-const char* scinmax_source =
-    "__kernel void test_scinmax(const __global Type *in, __global int4 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
-    "}\n";
-
-const char* scinmin_source =
-    "__kernel void test_scinmin(const __global Type *in, __global int4 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
-    "}\n";
-
-const char* scexadd_source =
-    "__kernel void test_scexadd(const __global Type *in, __global int4 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
-    "}\n";
-
-const char* scexmax_source =
-    "__kernel void test_scexmax(const __global Type *in, __global int4 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
-    "}\n";
-
-const char* scexmin_source =
-    "__kernel void test_scexmin(const __global Type *in, __global int4 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
-    "}\n";
+std::string sub_group_reduction_scan_source = R"(
+    __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        out[gid] = %s(in[gid]);
+    }
+)";
+
+std::string sub_group_generic_source = R"(
+    __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        Type x = in[gid];
+        out[gid] = %s(x, xy[gid].z);
+    }
+)";
\ No newline at end of file
diff --git a/test_conformance/subgroups/subgroup_common_kernels.h b/test_conformance/subgroups/subgroup_common_kernels.h
index 8ae97d9a36..bf2210ef3d 100644
--- a/test_conformance/subgroups/subgroup_common_kernels.h
+++ b/test_conformance/subgroups/subgroup_common_kernels.h
@@ -18,15 +18,7 @@
 #include "subhelpers.h"
 
 
-extern const char* bcast_source;
-extern const char* redadd_source;
-extern const char* redmax_source;
-extern const char* redmin_source;
-extern const char* scinadd_source;
-extern const char* scinmax_source;
-extern const char* scinmin_source;
-extern const char* scexadd_source;
-extern const char* scexmax_source;
-extern const char* scexmin_source;
+extern std::string sub_group_reduction_scan_source;
+extern std::string sub_group_generic_source;
 
 #endif
diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h
index 4333e95b6c..5c5f9560ac 100644
--- a/test_conformance/subgroups/subgroup_common_templates.h
+++ b/test_conformance/subgroups/subgroup_common_templates.h
@@ -17,13 +17,10 @@
 #define SUBGROUPCOMMONTEMPLATES_H
 
 #include "typeWrappers.h"
-#include <bitset>
 #include "CL/cl_half.h"
 #include "subhelpers.h"
-
 #include <set>
 
-typedef std::bitset<128> bs128;
 static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id,
                                   const std::string &mask_type,
                                   cl_uint max_sub_group_size)
@@ -577,16 +574,21 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
         ng = ng / nw;
         std::string func_name;
-        work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
-                        : func_name = "sub_group_scan_exclusive";
+        test_params.work_items_mask.any()
+            ? func_name = "sub_group_non_uniform_scan_exclusive"
+            : func_name = "sub_group_scan_exclusive";
         log_info("  %s_%s(%s)...\n", func_name.c_str(),
                  operation_names(operation), TypeManager<Ty>::name());
         log_info("  test params: global size = %d local size = %d subgroups "
-                 "size = %d work item mask = 0x%x \n",
-                 test_params.global_workgroup_size, nw, ns, work_items_mask);
+                 "size = %d \n",
+                 test_params.global_workgroup_size, nw, ns);
+        if (test_params.work_items_mask.any())
+        {
+            log_info("               work items mask: %s\n",
+                     test_params.work_items_mask.to_string().c_str());
+        }
         genrand<Ty, operation>(x, t, m, ns, nw, ng);
     }
 
@@ -597,18 +599,22 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
+        bs128 work_items_mask = test_params.work_items_mask;
         int nj = (nw + ns - 1) / ns;
         Ty tr, rr;
         ng = ng / nw;
 
         std::string func_name;
-        work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
-                        : func_name = "sub_group_scan_exclusive";
+        test_params.work_items_mask.any()
+            ? func_name = "sub_group_non_uniform_scan_exclusive"
+            : func_name = "sub_group_scan_exclusive";
+
 
-        uint32_t use_work_items_mask;
         // for uniform case take into consideration all workitems
-        use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
+        if (!work_items_mask.any())
+        {
+            work_items_mask.set();
+        }
         for (k = 0; k < ng; ++k)
         { // for each work_group
             // Map to array indexed to array indexed by local ID and sub group
@@ -624,8 +630,7 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
                 std::set<int> active_work_items;
                 for (i = 0; i < n; ++i)
                 {
-                    uint32_t check_work_item = 1 << (i % 32);
-                    if (use_work_items_mask & check_work_item)
+                    if (work_items_mask.test(i))
                     {
                         active_work_items.insert(i);
                     }
@@ -688,18 +693,23 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
         ng = ng / nw;
         std::string func_name;
-        work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
-                        : func_name = "sub_group_scan_inclusive";
+        test_params.work_items_mask.any()
+            ? func_name = "sub_group_non_uniform_scan_inclusive"
+            : func_name = "sub_group_scan_inclusive";
 
         genrand<Ty, operation>(x, t, m, ns, nw, ng);
         log_info("  %s_%s(%s)...\n", func_name.c_str(),
                  operation_names(operation), TypeManager<Ty>::name());
         log_info("  test params: global size = %d local size = %d subgroups "
-                 "size = %d work item mask = 0x%x \n",
-                 test_params.global_workgroup_size, nw, ns, work_items_mask);
+                 "size = %d \n",
+                 test_params.global_workgroup_size, nw, ns);
+        if (test_params.work_items_mask.any())
+        {
+            log_info("               work items mask: %s\n",
+                     test_params.work_items_mask.to_string().c_str());
+        }
     }
 
     static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
@@ -709,18 +719,22 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
+        bs128 work_items_mask = test_params.work_items_mask;
+
         int nj = (nw + ns - 1) / ns;
         Ty tr, rr;
         ng = ng / nw;
 
         std::string func_name;
-        work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
-                        : func_name = "sub_group_scan_inclusive";
+        work_items_mask.any()
+            ? func_name = "sub_group_non_uniform_scan_inclusive"
+            : func_name = "sub_group_scan_inclusive";
 
-        uint32_t use_work_items_mask;
         // for uniform case take into consideration all workitems
-        use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
+        if (!work_items_mask.any())
+        {
+            work_items_mask.set();
+        }
         // std::bitset<32> mask32(use_work_items_mask);
         // for (int k) mask32.count();
         for (k = 0; k < ng; ++k)
@@ -740,8 +754,7 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
 
                 for (i = 0; i < n; ++i)
                 {
-                    uint32_t check_work_item = 1 << (i % 32);
-                    if (use_work_items_mask & check_work_item)
+                    if (work_items_mask.test(i))
                     {
                         if (catch_frist_active == -1)
                         {
@@ -807,17 +820,22 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
         ng = ng / nw;
         std::string func_name;
 
-        work_items_mask ? func_name = "sub_group_non_uniform_reduce"
-                        : func_name = "sub_group_reduce";
+        test_params.work_items_mask.any()
+            ? func_name = "sub_group_non_uniform_reduce"
+            : func_name = "sub_group_reduce";
         log_info("  %s_%s(%s)...\n", func_name.c_str(),
                  operation_names(operation), TypeManager<Ty>::name());
         log_info("  test params: global size = %d local size = %d subgroups "
-                 "size = %d work item mask = 0x%x \n",
-                 test_params.global_workgroup_size, nw, ns, work_items_mask);
+                 "size = %d \n",
+                 test_params.global_workgroup_size, nw, ns);
+        if (test_params.work_items_mask.any())
+        {
+            log_info("               work items mask: %s\n",
+                     test_params.work_items_mask.to_string().c_str());
+        }
         genrand<Ty, operation>(x, t, m, ns, nw, ng);
     }
 
@@ -828,14 +846,14 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
+        bs128 work_items_mask = test_params.work_items_mask;
         int nj = (nw + ns - 1) / ns;
         ng = ng / nw;
         Ty tr, rr;
 
         std::string func_name;
-        work_items_mask ? func_name = "sub_group_non_uniform_reduce"
-                        : func_name = "sub_group_reduce";
+        work_items_mask.any() ? func_name = "sub_group_non_uniform_reduce"
+                              : func_name = "sub_group_reduce";
 
         for (k = 0; k < ng; ++k)
         {
@@ -847,9 +865,10 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
                 my[j] = y[j];
             }
 
-            uint32_t use_work_items_mask;
-            use_work_items_mask =
-                !work_items_mask ? 0xFFFFFFFF : work_items_mask;
+            if (!work_items_mask.any())
+            {
+                work_items_mask.set();
+            }
 
             for (j = 0; j < nj; ++j)
             {
@@ -859,8 +878,7 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
                 int catch_frist_active = -1;
                 for (i = 0; i < n; ++i)
                 {
-                    uint32_t check_work_item = 1 << (i % 32);
-                    if (use_work_items_mask & check_work_item)
+                    if (work_items_mask.test(i))
                     {
                         if (catch_frist_active == -1)
                         {
diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h
index 0d497fb325..6d32928aed 100644
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -24,31 +24,172 @@
 #include <limits>
 #include <vector>
 #include <type_traits>
+#include <bitset>
+#include <regex>
+#include <map>
 
 #define NR_OF_ACTIVE_WORK_ITEMS 4
 
 extern MTdata gMTdata;
+typedef std::bitset<128> bs128;
 extern cl_half_rounding_mode g_rounding_mode;
 
 struct WorkGroupParams
 {
     WorkGroupParams(size_t gws, size_t lws,
-                    const std::vector<uint32_t> &all_wim = {})
+                    bool use_mask = false)
         : global_workgroup_size(gws), local_workgroup_size(lws),
-          all_work_item_masks(all_wim)
+          use_masks(use_mask)
     {
         subgroup_size = 0;
         work_items_mask = 0;
         use_core_subgroups = true;
         dynsc = 0;
+        load_masks();
     }
     size_t global_workgroup_size;
     size_t local_workgroup_size;
     size_t subgroup_size;
-    uint32_t work_items_mask;
+    bs128 work_items_mask;
     int dynsc;
     bool use_core_subgroups;
-    std::vector<uint32_t> all_work_item_masks;
+    std::vector<bs128> all_work_item_masks;
+    bool use_masks;
+    void save_kernel_source(const std::string &source, std::string name = "")
+    {
+        if (name == "")
+        {
+            name = "default";
+        }
+        if (kernel_function_name.find(name) != kernel_function_name.end())
+        {
+            log_info("Kernel definition duplication. Source will be "
+                     "overwritten for function name %s",
+                     name.c_str());
+        }
+        kernel_function_name[name] = source;
+    };
+    // return specific defined kernel or default.
+    std::string get_kernel_source(std::string name)
+    {
+        if (kernel_function_name.find(name) == kernel_function_name.end())
+        {
+            return kernel_function_name["default"];
+        }
+        return kernel_function_name[name];
+    }
+
+
+private:
+    std::map<std::string, std::string> kernel_function_name;
+    void load_masks()
+    {
+        if (use_masks)
+        {
+            // '1' in the string sets the bit, '0' in the string clears it
+            bs128 mask_0xf0f0f0f0("11110000111100001111000011110000"
+                                  "11110000111100001111000011110000"
+                                  "11110000111100001111000011110000"
+                                  "11110000111100001111000011110000",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0xf0f0f0f0);
+            // inverted: '1' in the string clears the bit, '0' sets it
+            bs128 mask_0x0f0f0f0f("11110000111100001111000011110000"
+                                  "11110000111100001111000011110000"
+                                  "11110000111100001111000011110000"
+                                  "11110000111100001111000011110000",
+                                  128, '1', '0');
+            all_work_item_masks.push_back(mask_0x0f0f0f0f);
+            bs128 mask_0x5555aaaa("10101010101010101010101010101010"
+                                  "10101010101010101010101010101010"
+                                  "10101010101010101010101010101010"
+                                  "10101010101010101010101010101010",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0x5555aaaa);
+            bs128 mask_0xaaaa5555("10101010101010101010101010101010"
+                                  "10101010101010101010101010101010"
+                                  "10101010101010101010101010101010"
+                                  "10101010101010101010101010101010",
+                                  128, '1', '0');
+            all_work_item_masks.push_back(mask_0xaaaa5555);
+            // 0x0f0ff0f0
+            bs128 mask_0x0f0ff0f0("00001111000011111111000011110000"
+                                  "00001111000011111111000011110000"
+                                  "00001111000011111111000011110000"
+                                  "00001111000011111111000011110000",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0x0f0ff0f0);
+            // 0xff0000ff
+            bs128 mask_0xff0000ff("11111111000000000000000011111111"
+                                  "11111111000000000000000011111111"
+                                  "11111111000000000000000011111111"
+                                  "11111111000000000000000011111111",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0xff0000ff);
+            // 0xff00ff00
+            bs128 mask_0xff00ff00("11111111000000001111111100000000"
+                                  "11111111000000001111111100000000"
+                                  "11111111000000001111111100000000"
+                                  "11111111000000001111111100000000",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0xff00ff00);
+            // 0x00ffff00
+            bs128 mask_0x00ffff00("00000000111111111111111100000000"
+                                  "00000000111111111111111100000000"
+                                  "00000000111111111111111100000000"
+                                  "00000000111111111111111100000000",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0x00ffff00);
+            // 0x80 1 workitem highest id for 8 subgroup size
+            bs128 mask_0x80808080("10000000100000001000000010000000"
+                                  "10000000100000001000000010000000"
+                                  "10000000100000001000000010000000"
+                                  "10000000100000001000000010000000",
+                                  128, '0', '1');
+
+            all_work_item_masks.push_back(mask_0x80808080);
+            // 0x8000 1 workitem highest id for 16 subgroup size
+            bs128 mask_0x80008000("10000000000000001000000000000000"
+                                  "10000000000000001000000000000000"
+                                  "10000000000000001000000000000000"
+                                  "10000000000000001000000000000000",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0x80008000);
+            // 0x80000000 1 workitem highest id for 32 subgroup size
+            bs128 mask_0x80000000("10000000000000000000000000000000"
+                                  "10000000000000000000000000000000"
+                                  "10000000000000000000000000000000"
+                                  "10000000000000000000000000000000",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0x80000000);
+            // 0x80000000 00000000 1 workitem highest id for 64 subgroup size
+            // (pattern repeated for both 64-bit halves of the 128-bit mask)
+            bs128 mask_0x8000000000000000("10000000000000000000000000000000"
+                                          "00000000000000000000000000000000"
+                                          "10000000000000000000000000000000"
+                                          "00000000000000000000000000000000",
+                                          128, '0', '1');
+
+            all_work_item_masks.push_back(mask_0x8000000000000000);
+            // 0x80000000 00000000 00000000 00000000 1 workitem highest id for
+            // 128 subgroup size
+            bs128 mask_0x80000000000000000000000000000000(
+                "10000000000000000000000000000000"
+                "00000000000000000000000000000000"
+                "00000000000000000000000000000000"
+                "00000000000000000000000000000000",
+                128, '0', '1');
+            all_work_item_masks.push_back(
+                mask_0x80000000000000000000000000000000);
+
+            bs128 mask_0xffffffff("11111111111111111111111111111111"
+                                  "11111111111111111111111111111111"
+                                  "11111111111111111111111111111111"
+                                  "11111111111111111111111111111111",
+                                  128, '0', '1');
+            all_work_item_masks.push_back(mask_0xffffffff);
+        }
+    }
 };
 
 enum class SubgroupsBroadcastOp
@@ -1267,11 +1408,23 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
         std::vector<Ty> mapout;
         mapout.resize(local);
         std::stringstream kernel_sstr;
-        if (test_params.work_items_mask != 0)
+        if (test_params.use_masks)
         {
-            kernel_sstr << "#define WORK_ITEMS_MASK ";
-            kernel_sstr << "0x" << std::hex << test_params.work_items_mask
-                        << "\n";
+            // Prepare a uint4 to hold the bitmask on the OpenCL C kernel side.
+            // bitset::to_string() emits the highest bit first, so the lowest 32
+            // bits start at string offset 96; slices are taken from offset 96
+            // down to 0 so the first uint4 component holds the lowest bits.
+            std::bitset<32> bits_1_32(test_params.work_items_mask.to_string(),
+                                      96, 32);
+            std::bitset<32> bits_33_64(test_params.work_items_mask.to_string(),
+                                       64, 32);
+            std::bitset<32> bits_65_96(test_params.work_items_mask.to_string(),
+                                       32, 32);
+            std::bitset<32> bits_97_128(test_params.work_items_mask.to_string(),
+                                        0, 32);
+            kernel_sstr << "global uint4 work_item_mask_vector = (uint4)(0b"
+                        << bits_1_32 << ",0b" << bits_33_64 << ",0b"
+                        << bits_65_96 << ",0b" << bits_97_128 << ");\n";
         }
 
 
@@ -1452,18 +1605,24 @@ struct RunTestForType
           num_elements_(num_elements), test_params_(test_params)
     {}
     template <typename T, typename U>
-    int run_impl(const char *kernel_name, const char *source)
+    int run_impl(const std::string &function_name)
     {
         int error = TEST_PASS;
+        std::string source =
+            std::regex_replace(test_params_.get_kernel_source(function_name),
+                               std::regex("\\%s"), function_name);
+        std::string kernel_name = "test_" + function_name;
         if (test_params_.all_work_item_masks.size() > 0)
         {
             error = test<T, U>::mrun(device_, context_, queue_, num_elements_,
-                                     kernel_name, source, test_params_);
+                                     kernel_name.c_str(), source.c_str(),
+                                     test_params_);
         }
         else
         {
             error = test<T, U>::run(device_, context_, queue_, num_elements_,
-                                    kernel_name, source, test_params_);
+                                    kernel_name.c_str(), source.c_str(),
+                                    test_params_);
         }
 
         return error;
diff --git a/test_conformance/subgroups/test_subgroup.cpp b/test_conformance/subgroups/test_subgroup.cpp
index c0e4952408..63bfc4532c 100644
--- a/test_conformance/subgroups/test_subgroup.cpp
+++ b/test_conformance/subgroups/test_subgroup.cpp
@@ -150,25 +150,25 @@ template <typename T>
 int run_broadcast_scan_reduction_for_type(RunTestForType rft)
 {
     int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
-        "test_bcast", bcast_source);
-    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
-                                                            redadd_source);
-    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
-                                                            redmax_source);
-    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
-                                                            redmin_source);
-    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
-                                                             scinadd_source);
-    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
-                                                             scinmax_source);
-    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
-                                                             scinmin_source);
-    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
-                                                             scexadd_source);
-    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
-                                                             scexmax_source);
-    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
-                                                             scexmin_source);
+        "sub_group_broadcast");
+    error |=
+        rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("sub_group_reduce_add");
+    error |=
+        rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("sub_group_reduce_max");
+    error |=
+        rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("sub_group_reduce_min");
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
+        "sub_group_scan_inclusive_add");
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
+        "sub_group_scan_inclusive_max");
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
+        "sub_group_scan_inclusive_min");
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
+        "sub_group_scan_exclusive_add");
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
+        "sub_group_scan_exclusive_max");
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
+        "sub_group_scan_exclusive_min");
     return error;
 }
 
@@ -181,11 +181,14 @@ int test_subgroup_functions(cl_device_id device, cl_context context,
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
     WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.save_kernel_source(sub_group_reduction_scan_source);
+    test_params.save_kernel_source(sub_group_generic_source,
+                                   "sub_group_broadcast");
+
     RunTestForType rft(device, context, queue, num_elements, test_params);
     int error =
-        rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("test_any", any_source);
-    error |=
-        rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("test_all", all_source);
+        rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("sub_group_any");
+    error |= rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("sub_group_all");
     error |= run_broadcast_scan_reduction_for_type<cl_int>(rft);
     error |= run_broadcast_scan_reduction_for_type<cl_uint>(rft);
     error |= run_broadcast_scan_reduction_for_type<cl_long>(rft);
diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp
index 9a2da5d9c7..2bd54e4355 100644
--- a/test_conformance/subgroups/test_subgroup_ballot.cpp
+++ b/test_conformance/subgroups/test_subgroup_ballot.cpp
@@ -684,239 +684,127 @@ template <typename Ty, BallotOp operation> struct SMASK
     }
 };
 
-static const char *bcast_non_uniform_source =
-    "__kernel void test_bcast_non_uniform(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n"
-    "        out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);\n"
-    "    } else {\n"
-    "       out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);\n"
-    "    }\n"
-    "}\n";
-
-static const char *bcast_first_source =
-    "__kernel void test_bcast_first(const __global Type *in, __global int4 "
-    "*xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n"
-    "       out[gid] = sub_group_broadcast_first(x);\n"
-    "    } else {\n"
-    "       out[gid] = sub_group_broadcast_first(x);\n"
-    "    }\n"
-    "}\n";
-
-static const char *ballot_bit_count_source =
-    "__kernel void test_sub_group_ballot_bit_count(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint4 value = (uint4)(0,0,0,0);\n"
-    "    value = (uint4)(sub_group_ballot_bit_count(x),0,0,0);\n"
-    "    out[gid] = value;\n"
-    "}\n";
-
-static const char *ballot_inclusive_scan_source =
-    "__kernel void test_sub_group_ballot_inclusive_scan(const __global Type "
-    "*in, __global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint4 value = (uint4)(0,0,0,0);\n"
-    "    value = (uint4)(sub_group_ballot_inclusive_scan(x),0,0,0);\n"
-    "    out[gid] = value;\n"
-    "}\n";
-
-static const char *ballot_exclusive_scan_source =
-    "__kernel void test_sub_group_ballot_exclusive_scan(const __global Type "
-    "*in, __global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint4 value = (uint4)(0,0,0,0);\n"
-    "    value = (uint4)(sub_group_ballot_exclusive_scan(x),0,0,0);\n"
-    "    out[gid] = value;\n"
-    "}\n";
-
-static const char *ballot_find_lsb_source =
-    "__kernel void test_sub_group_ballot_find_lsb(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint4 value = (uint4)(0,0,0,0);\n"
-    "    value = (uint4)(sub_group_ballot_find_lsb(x),0,0,0);\n"
-    "    out[gid] = value;\n"
-    "}\n";
-
-static const char *ballot_find_msb_source =
-    "__kernel void test_sub_group_ballot_find_msb(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint4 value = (uint4)(0,0,0,0);"
-    "    value = (uint4)(sub_group_ballot_find_msb(x),0,0,0);"
-    "    out[gid] = value ;"
-    "}\n";
-
-static const char *get_subgroup_ge_mask_source =
-    "__kernel void test_get_sub_group_ge_mask(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].z = get_max_sub_group_size();\n"
-    "    Type x = in[gid];\n"
-    "    uint4 mask = get_sub_group_ge_mask();"
-    "    out[gid] = mask;\n"
-    "}\n";
-
-static const char *get_subgroup_gt_mask_source =
-    "__kernel void test_get_sub_group_gt_mask(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].z = get_max_sub_group_size();\n"
-    "    Type x = in[gid];\n"
-    "    uint4 mask = get_sub_group_gt_mask();"
-    "    out[gid] = mask;\n"
-    "}\n";
-
-static const char *get_subgroup_le_mask_source =
-    "__kernel void test_get_sub_group_le_mask(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].z = get_max_sub_group_size();\n"
-    "    Type x = in[gid];\n"
-    "    uint4 mask = get_sub_group_le_mask();"
-    "    out[gid] = mask;\n"
-    "}\n";
-
-static const char *get_subgroup_lt_mask_source =
-    "__kernel void test_get_sub_group_lt_mask(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].z = get_max_sub_group_size();\n"
-    "    Type x = in[gid];\n"
-    "    uint4 mask = get_sub_group_lt_mask();"
-    "    out[gid] = mask;\n"
-    "}\n";
-
-static const char *get_subgroup_eq_mask_source =
-    "__kernel void test_get_sub_group_eq_mask(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].z = get_max_sub_group_size();\n"
-    "    Type x = in[gid];\n"
-    "    uint4 mask = get_sub_group_eq_mask();"
-    "    out[gid] = mask;\n"
-    "}\n";
-
-static const char *ballot_source =
-    "__kernel void test_sub_group_ballot(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "uint4 full_ballot = sub_group_ballot(1);\n"
-    "uint divergence_mask;\n"
-    "uint4 partial_ballot;\n"
-    "uint gid = get_global_id(0);"
-    "XY(xy,gid);\n"
-    "if (get_sub_group_local_id() & 1) {\n"
-    "    divergence_mask = 0xaaaaaaaa;\n"
-    "    partial_ballot = sub_group_ballot(1);\n"
-    "} else {\n"
-    "    divergence_mask = 0x55555555;\n"
-    "    partial_ballot = sub_group_ballot(1);\n"
-    "}\n"
-    " size_t lws = get_local_size(0);\n"
-    "uint4 masked_ballot = full_ballot;\n"
-    "masked_ballot.x &= divergence_mask;\n"
-    "masked_ballot.y &= divergence_mask;\n"
-    "masked_ballot.z &= divergence_mask;\n"
-    "masked_ballot.w &= divergence_mask;\n"
-    "out[gid] = all(masked_ballot == partial_ballot);\n"
-
-    "} \n";
-
-static const char *ballot_source_inverse =
-    "__kernel void test_sub_group_ballot_inverse(const __global "
-    "Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint4 value = (uint4)(10,0,0,0);\n"
-    "    if (get_sub_group_local_id() & 1) {"
-    "        uint4 partial_ballot_mask = "
-    "(uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA);"
-    "        if (sub_group_inverse_ballot(partial_ballot_mask)) {\n"
-    "            value = (uint4)(1,0,0,1);\n"
-    "        } else {\n"
-    "            value = (uint4)(0,0,0,1);\n"
-    "        }\n"
-    "    } else {\n"
-    "       uint4 partial_ballot_mask = "
-    "(uint4)(0x55555555,0x55555555,0x55555555,0x55555555);"
-    "        if (sub_group_inverse_ballot(partial_ballot_mask)) {\n"
-    "            value = (uint4)(1,0,0,2);\n"
-    "        } else {\n"
-    "            value = (uint4)(0,0,0,2);\n"
-    "        }\n"
-    "    }\n"
-    "    out[gid] = value;\n"
-    "}\n";
+std::string sub_group_non_uniform_broadcast_source = R"(
+__kernel void test_sub_group_non_uniform_broadcast(const __global Type *in, __global int4 *xy, __global Type *out) {
+    int gid = get_global_id(0);
+    XY(xy,gid);
+    Type x = in[gid];
+    if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {
+        out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);
+    } else {
+        out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);
+    }
+}
+)";
+std::string sub_group_broadcast_first_source = R"(
+__kernel void test_sub_group_broadcast_first(const __global Type *in, __global int4 *xy, __global Type *out) {
+    int gid = get_global_id(0);
+    XY(xy,gid);
+    Type x = in[gid];
+    if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {
+        out[gid] = sub_group_broadcast_first(x);
+    } else {
+        out[gid] = sub_group_broadcast_first(x);
+    }
+}
+)"; // both branches issue the same sub_group_broadcast_first(x) call
+std::string sub_group_ballot_bit_scan_find_source = R"(
+__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
+    int gid = get_global_id(0);
+    XY(xy,gid);
+    Type x = in[gid];
+    uint4 value = (uint4)(0,0,0,0);
+    value = (uint4)(%s(x),0,0,0);
+    out[gid] = value;
+}
+)";
+std::string sub_group_ballot_mask_source = R"(
+__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
+    int gid = get_global_id(0);
+    XY(xy,gid);
+    xy[gid].z = get_max_sub_group_size();
+    Type x = in[gid];
+    uint4 mask = %s();
+    out[gid] = mask;
+}
+)";
+std::string sub_group_ballot_source = R"(
+__kernel void test_sub_group_ballot(const __global Type *in, __global int4 *xy, __global Type *out) {
+    uint4 full_ballot = sub_group_ballot(1);
+    uint divergence_mask;
+    uint4 partial_ballot;
+    uint gid = get_global_id(0);
+    XY(xy,gid);
+    if (get_sub_group_local_id() & 1) {
+        divergence_mask = 0xaaaaaaaa;
+        partial_ballot = sub_group_ballot(1);
+    } else {
+        divergence_mask = 0x55555555;
+        partial_ballot = sub_group_ballot(1);
+    }
+     size_t lws = get_local_size(0);
+    uint4 masked_ballot = full_ballot;
+    masked_ballot.x &= divergence_mask;
+    masked_ballot.y &= divergence_mask;
+    masked_ballot.z &= divergence_mask;
+    masked_ballot.w &= divergence_mask;
+    out[gid] = all(masked_ballot == partial_ballot);
 
-static const char *ballot_bit_extract_source =
-    "__kernel void test_sub_group_ballot_bit_extract(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    uint index = xy[gid].z;\n"
-    "    uint4 value = (uint4)(10,0,0,0);\n"
-    "    if (get_sub_group_local_id() & 1) {"
-    "       if (sub_group_ballot_bit_extract(x, xy[gid].z)) {\n"
-    "           value = (uint4)(1,0,0,1);\n"
-    "       } else {\n"
-    "           value = (uint4)(0,0,0,1);\n"
-    "       }\n"
-    "    } else {\n"
-    "       if (sub_group_ballot_bit_extract(x, xy[gid].w)) {\n"
-    "           value = (uint4)(1,0,0,2);\n"
-    "       } else {\n"
-    "           value = (uint4)(0,0,0,2);\n"
-    "       }\n"
-    "    }\n"
-    "    out[gid] = value;\n"
-    "}\n";
+}
+)";
+std::string sub_group_inverse_ballot_source = R"(
+__kernel void test_sub_group_inverse_ballot(const __global Type *in, __global int4 *xy, __global Type *out) {
+    int gid = get_global_id(0);
+    XY(xy,gid);
+    Type x = in[gid];
+    uint4 value = (uint4)(10,0,0,0);
+    if (get_sub_group_local_id() & 1) {
+        uint4 partial_ballot_mask = (uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA);
+        if (sub_group_inverse_ballot(partial_ballot_mask)) {
+            value = (uint4)(1,0,0,1);
+        } else {
+            value = (uint4)(0,0,0,1);
+        }
+    } else {
+        uint4 partial_ballot_mask = (uint4)(0x55555555,0x55555555,0x55555555,0x55555555);
+        if (sub_group_inverse_ballot(partial_ballot_mask)) {
+            value = (uint4)(1,0,0,2);
+        } else {
+            value = (uint4)(0,0,0,2);
+        }
+    }
+    out[gid] = value;
+}
+)";
+std::string sub_group_ballot_bit_extract_source = R"(
+__kernel void test_sub_group_ballot_bit_extract(const __global Type *in, __global int4 *xy, __global Type *out) {
+    int gid = get_global_id(0);
+    XY(xy,gid);
+    Type x = in[gid];
+    uint index = xy[gid].z;
+    uint4 value = (uint4)(10,0,0,0);
+    if (get_sub_group_local_id() & 1) {
+        if (sub_group_ballot_bit_extract(x, xy[gid].z)) {
+            value = (uint4)(1,0,0,1);
+        } else {
+            value = (uint4)(0,0,0,1);
+        }
+    } else {
+        if (sub_group_ballot_bit_extract(x, xy[gid].w)) {
+            value = (uint4)(1,0,0,2);
+        } else {
+            value = (uint4)(0,0,0,2);
+        }
+    }
+    out[gid] = value;
+}
+)"; // odd sub-group local ids probe bit xy[gid].z, even ones xy[gid].w
 
 template <typename T> int run_non_uniform_broadcast_for_type(RunTestForType rft)
 {
     int error =
         rft.run_impl<T, BC<T, SubgroupsBroadcastOp::non_uniform_broadcast>>(
-            "test_bcast_non_uniform", bcast_non_uniform_source);
+            "sub_group_non_uniform_broadcast");
     return error;
 }
 
@@ -932,9 +820,15 @@ int test_subgroup_functions_ballot(cl_device_id device, cl_context context,
                  "skipping test.\n");
         return TEST_SKIPPED_ITSELF;
     }
+
     constexpr size_t global_work_size = 170;
     constexpr size_t local_work_size = 64;
     WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.save_kernel_source(sub_group_ballot_mask_source);
+    test_params.save_kernel_source(sub_group_non_uniform_broadcast_source,
+                                   "sub_group_non_uniform_broadcast");
+    test_params.save_kernel_source(sub_group_broadcast_first_source,
+                                   "sub_group_broadcast_first");
     RunTestForType rft(device, context, queue, num_elements, test_params);
 
     // non uniform broadcast functions
@@ -1018,76 +912,87 @@ int test_subgroup_functions_ballot(cl_device_id device, cl_context context,
     // broadcast first functions
     error |=
         rft.run_impl<cl_int, BC<cl_int, SubgroupsBroadcastOp::broadcast_first>>(
-            "test_bcast_first", bcast_first_source);
+            "sub_group_broadcast_first");
     error |= rft.run_impl<cl_uint,
                           BC<cl_uint, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_long,
                           BC<cl_long, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_ulong,
                           BC<cl_ulong, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_short,
                           BC<cl_short, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_ushort,
                           BC<cl_ushort, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_char,
                           BC<cl_char, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_uchar,
                           BC<cl_uchar, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_float,
                           BC<cl_float, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<cl_double,
                           BC<cl_double, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
     error |= rft.run_impl<
         subgroups::cl_half,
         BC<subgroups::cl_half, SubgroupsBroadcastOp::broadcast_first>>(
-        "test_bcast_first", bcast_first_source);
+        "sub_group_broadcast_first");
 
     // mask functions
     error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::eq_mask>>(
-        "test_get_sub_group_eq_mask", get_subgroup_eq_mask_source);
+        "get_sub_group_eq_mask");
     error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::ge_mask>>(
-        "test_get_sub_group_ge_mask", get_subgroup_ge_mask_source);
+        "get_sub_group_ge_mask");
     error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::gt_mask>>(
-        "test_get_sub_group_gt_mask", get_subgroup_gt_mask_source);
+        "get_sub_group_gt_mask");
     error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::le_mask>>(
-        "test_get_sub_group_le_mask", get_subgroup_le_mask_source);
+        "get_sub_group_le_mask");
     error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::lt_mask>>(
-        "test_get_sub_group_lt_mask", get_subgroup_lt_mask_source);
+        "get_sub_group_lt_mask");
 
     // ballot functions
-    error |= rft.run_impl<cl_uint, BALLOT<cl_uint>>("test_sub_group_ballot",
-                                                    ballot_source);
-    error |= rft.run_impl<cl_uint4,
-                          BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>(
-        "test_sub_group_ballot_inverse", ballot_source_inverse);
-    error |= rft.run_impl<
+    WorkGroupParams test_params_ballot(global_work_size, local_work_size);
+    test_params_ballot.save_kernel_source(
+        sub_group_ballot_bit_scan_find_source);
+    test_params_ballot.save_kernel_source(sub_group_ballot_source,
+                                          "sub_group_ballot");
+    test_params_ballot.save_kernel_source(sub_group_inverse_ballot_source,
+                                          "sub_group_inverse_ballot");
+    test_params_ballot.save_kernel_source(sub_group_ballot_bit_extract_source,
+                                          "sub_group_ballot_bit_extract");
+    RunTestForType rft_ballot(device, context, queue, num_elements,
+                              test_params_ballot);
+    error |= rft_ballot.run_impl<cl_uint, BALLOT<cl_uint>>("sub_group_ballot");
+    error |=
+        rft_ballot.run_impl<cl_uint4,
+                            BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>(
+            "sub_group_inverse_ballot");
+    error |= rft_ballot.run_impl<
         cl_uint4, BALLOT_BIT_EXTRACT<cl_uint4, BallotOp::ballot_bit_extract>>(
-        "test_sub_group_ballot_bit_extract", ballot_bit_extract_source);
-    error |= rft.run_impl<
+        "sub_group_ballot_bit_extract");
+    error |= rft_ballot.run_impl<
         cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_bit_count>>(
-        "test_sub_group_ballot_bit_count", ballot_bit_count_source);
-    error |= rft.run_impl<
+        "sub_group_ballot_bit_count");
+    error |= rft_ballot.run_impl<
         cl_uint4,
         BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_inclusive_scan>>(
-        "test_sub_group_ballot_inclusive_scan", ballot_inclusive_scan_source);
-    error |= rft.run_impl<
+        "sub_group_ballot_inclusive_scan");
+    error |= rft_ballot.run_impl<
         cl_uint4,
         BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_exclusive_scan>>(
-        "test_sub_group_ballot_exclusive_scan", ballot_exclusive_scan_source);
-    error |= rft.run_impl<
+        "sub_group_ballot_exclusive_scan");
+    error |= rft_ballot.run_impl<
         cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_lsb>>(
-        "test_sub_group_ballot_find_lsb", ballot_find_lsb_source);
-    error |= rft.run_impl<
+        "sub_group_ballot_find_lsb");
+    error |= rft_ballot.run_impl<
         cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_msb>>(
-        "test_sub_group_ballot_find_msb", ballot_find_msb_source);
+        "sub_group_ballot_find_msb");
     return error;
 }
diff --git a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
index 87507e3792..11fcebc4a2 100644
--- a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
+++ b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
@@ -22,149 +22,17 @@
 #define CLUSTER_SIZE_STR "4"
 
 namespace {
-static const char *redadd_clustered_source =
-    "__kernel void test_redadd_clustered(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR
-    ");\n"
-    "}\n";
-
-static const char *redmax_clustered_source =
-    "__kernel void test_redmax_clustered(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR
-    ");\n"
-    "}\n";
-
-static const char *redmin_clustered_source =
-    "__kernel void test_redmin_clustered(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR
-    ");\n"
-    "}\n";
-
-static const char *redmul_clustered_source =
-    "__kernel void test_redmul_clustered(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR
-    ");\n"
-    "}\n";
-
-static const char *redand_clustered_source =
-    "__kernel void test_redand_clustered(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR
-    ");\n"
-    "}\n";
-
-static const char *redor_clustered_source =
-    "__kernel void test_redor_clustered(const __global Type *in, __global int4 "
-    "*xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR
-    ");\n"
-    "}\n";
-
-static const char *redxor_clustered_source =
-    "__kernel void test_redxor_clustered(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR
-    ");\n"
-    "}\n";
-
-static const char *redand_clustered_logical_source =
-    "__kernel void test_redand_clustered_logical(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR
-    ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = "
-    "sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR ");\n"
-    "}\n";
-
-static const char *redor_clustered_logical_source =
-    "__kernel void test_redor_clustered_logical(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if (sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR
-    ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = "
-    "sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR ");\n"
-    "}\n";
-
-static const char *redxor_clustered_logical_source =
-    "__kernel void test_redxor_clustered_logical(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    xy[gid].w = 0;\n"
-    "    if ( sizeof(in[gid]) == "
-    "sizeof(sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR
-    ")))\n"
-    "    {xy[gid].w = sizeof(in[gid]);}\n"
-    "    out[gid] = "
-    "sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR ");\n"
-    "}\n";
-
+std::string sub_group_clustered_reduce_source = R"(
+__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        xy[gid].w = 0;
+        if (sizeof(in[gid]) == sizeof(%s(in[gid], )" CLUSTER_SIZE_STR R"())) {
+            xy[gid].w = sizeof(in[gid]);
+        }
+        out[gid] = %s(in[gid], )" CLUSTER_SIZE_STR R"();
+}
+)"; // three %s placeholders; CLUSTER_SIZE_STR is spliced in by literal concatenation
 
 // DESCRIPTION:
 // Test for reduce cluster functions
@@ -267,34 +135,34 @@ template <typename T>
 int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft)
 {
     int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::add_>>(
-        "test_redadd_clustered", redadd_clustered_source);
+        "sub_group_clustered_reduce_add");
     error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::max_>>(
-        "test_redmax_clustered", redmax_clustered_source);
+        "sub_group_clustered_reduce_max");
     error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::min_>>(
-        "test_redmin_clustered", redmin_clustered_source);
+        "sub_group_clustered_reduce_min");
     error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::mul_>>(
-        "test_redmul_clustered", redmul_clustered_source);
+        "sub_group_clustered_reduce_mul");
     return error;
 }
 template <typename T> int run_cluster_and_or_xor_for_type(RunTestForType rft)
 {
     int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::and_>>(
-        "test_redand_clustered", redand_clustered_source);
+        "sub_group_clustered_reduce_and");
     error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::or_>>(
-        "test_redor_clustered", redor_clustered_source);
+        "sub_group_clustered_reduce_or");
     error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::xor_>>(
-        "test_redxor_clustered", redxor_clustered_source);
+        "sub_group_clustered_reduce_xor");
     return error;
 }
 template <typename T>
 int run_cluster_logical_and_or_xor_for_type(RunTestForType rft)
 {
     int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_and>>(
-        "test_redand_clustered_logical", redand_clustered_logical_source);
+        "sub_group_clustered_reduce_logical_and");
     error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_or>>(
-        "test_redor_clustered_logical", redor_clustered_logical_source);
+        "sub_group_clustered_reduce_logical_or");
     error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_xor>>(
-        "test_redxor_clustered_logical", redxor_clustered_logical_source);
+        "sub_group_clustered_reduce_logical_xor");
 
     return error;
 }
@@ -311,9 +179,11 @@ int test_subgroup_functions_clustered_reduce(cl_device_id device,
                  "device, skipping test.\n");
         return TEST_SKIPPED_ITSELF;
     }
+
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
     WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.save_kernel_source(sub_group_clustered_reduce_source);
     RunTestForType rft(device, context, queue, num_elements, test_params);
 
     int error = run_cluster_red_add_max_min_mul_for_type<cl_int>(rft);
diff --git a/test_conformance/subgroups/test_subgroup_extended_types.cpp b/test_conformance/subgroups/test_subgroup_extended_types.cpp
index b281f6187c..dbe2462337 100644
--- a/test_conformance/subgroups/test_subgroup_extended_types.cpp
+++ b/test_conformance/subgroups/test_subgroup_extended_types.cpp
@@ -24,30 +24,30 @@ namespace {
 template <typename T> int run_broadcast_for_extended_type(RunTestForType rft)
 {
     int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
-        "test_bcast", bcast_source);
+        "sub_group_broadcast");
     return error;
 }
 
 template <typename T> int run_scan_reduction_for_type(RunTestForType rft)
 {
-    int error = rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
-                                                               redadd_source);
-    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
-                                                            redmax_source);
-    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
-                                                            redmin_source);
-    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
-                                                             scinadd_source);
-    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
-                                                             scinmax_source);
-    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
-                                                             scinmin_source);
-    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
-                                                             scexadd_source);
-    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
-                                                             scexmax_source);
-    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
-                                                             scexmin_source);
+    int error =
+        rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("sub_group_reduce_add");
+    error |=
+        rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("sub_group_reduce_max");
+    error |=
+        rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("sub_group_reduce_min");
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
+        "sub_group_scan_inclusive_add");
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
+        "sub_group_scan_inclusive_max");
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
+        "sub_group_scan_inclusive_min");
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
+        "sub_group_scan_exclusive_add");
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
+        "sub_group_scan_exclusive_max");
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
+        "sub_group_scan_exclusive_min");
     return error;
 }
 
@@ -65,11 +65,15 @@ int test_subgroup_functions_extended_types(cl_device_id device,
                  "device, skipping test.\n");
         return TEST_SKIPPED_ITSELF;
     }
+
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
     WorkGroupParams test_params(global_work_size, local_work_size);
-    RunTestForType rft(device, context, queue, num_elements, test_params);
+    test_params.save_kernel_source(sub_group_reduction_scan_source);
+    test_params.save_kernel_source(sub_group_generic_source,
+                                   "sub_group_broadcast");
 
+    RunTestForType rft(device, context, queue, num_elements, test_params);
     int error = run_broadcast_for_extended_type<cl_uint2>(rft);
     error |= run_broadcast_for_extended_type<subgroups::cl_uint3>(rft);
     error |= run_broadcast_for_extended_type<cl_uint4>(rft);
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
index 6c44249edb..bb257bcdf1 100644
--- a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
@@ -17,336 +17,29 @@
 #include "subhelpers.h"
 #include "harness/typeWrappers.h"
 #include "subgroup_common_templates.h"
+#include <cstdio>
 
 namespace {
 
-static const char *scinadd_non_uniform_source = R"(
-    __kernel void test_scinadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+std::string sub_group_non_uniform_arithmetic_source = R"(
+    __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
         int gid = get_global_id(0);
         XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_add(in[gid]);
-            }
-    }
-)";
-
-static const char *scinmax_non_uniform_source = R"(
-    __kernel void test_scinmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_max(in[gid]);
-            }
-    }
-)";
-
-static const char *scinmin_non_uniform_source = R"(
-    __kernel void test_scinmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_min(in[gid]);
-            }
-    }
-)";
-
-static const char *scinmul_non_uniform_source = R"(
-    __kernel void test_scinmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_mul(in[gid]);
-            }
-    }
-)";
-
-static const char *scinand_non_uniform_source = R"(
-    __kernel void test_scinand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_and(in[gid]);
-            }
-    }
-)";
-
-static const char *scinor_non_uniform_source = R"(
-    __kernel void test_scinor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_or(in[gid]);
-            }
-    }
-)";
-
-static const char *scinxor_non_uniform_source = R"(
-    __kernel void test_scinxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_xor(in[gid]);
-            }
-    }
-)";
-
-static const char *scinand_non_uniform_logical_source = R"(
-    __kernel void test_scinand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_logical_and(in[gid]);
-            }
-    }
-)";
-
-static const char *scinor_non_uniform_logical_source = R"(
-    __kernel void test_scinor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_logical_or(in[gid]);
-            }
-    }
-)";
-
-static const char *scinxor_non_uniform_logical_source = R"(
-    __kernel void test_scinxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_inclusive_logical_xor(in[gid]);
-            }
-    }
-)";
-
-static const char *scexadd_non_uniform_source = R"(
-    __kernel void test_scexadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_add(in[gid]);
-            }
-    }
-)";
-
-static const char *scexmax_non_uniform_source = R"(
-    __kernel void test_scexmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_max(in[gid]);
-            }
-    }
-)";
-
-static const char *scexmin_non_uniform_source = R"(
-    __kernel void test_scexmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_min(in[gid]);
-            }
-    }
-)";
-
-static const char *scexmul_non_uniform_source = R"(
-    __kernel void test_scexmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_mul(in[gid]);
-            }
-    }
-)";
-
-static const char *scexand_non_uniform_source = R"(
-    __kernel void test_scexand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_and(in[gid]);
-            }
-    }
-)";
-
-static const char *scexor_non_uniform_source = R"(
-    __kernel void test_scexor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_or(in[gid]);
-            }
-    }
-)";
-
-static const char *scexxor_non_uniform_source = R"(
-    __kernel void test_scexxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_xor(in[gid]);
-            }
-    }
-)";
-
-static const char *scexand_non_uniform_logical_source = R"(
-    __kernel void test_scexand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_logical_and(in[gid]);
-            }
-    }
-)";
-
-static const char *scexor_non_uniform_logical_source = R"(
-    __kernel void test_scexor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_logical_or(in[gid]);
-            }
-    }
-)";
-
-static const char *scexxor_non_uniform_logical_source = R"(
-    __kernel void test_scexxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_scan_exclusive_logical_xor(in[gid]);
-            }
-    }
-)";
-
-static const char *redadd_non_uniform_source = R"(
-    __kernel void test_redadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_add(in[gid]);
-            }
-    }
-)";
-
-static const char *redmax_non_uniform_source = R"(
-    __kernel void test_redmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_max(in[gid]);
-            }
-    }
-)";
-
-static const char *redmin_non_uniform_source = R"(
-    __kernel void test_redmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_min(in[gid]);
-            }
-    }
-)";
-
-static const char *redmul_non_uniform_source = R"(
-    __kernel void test_redmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_mul(in[gid]);
-            }
-    }
-)";
-
-static const char *redand_non_uniform_source = R"(
-    __kernel void test_redand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_and(in[gid]);
-            }
-    }
-)";
-
-static const char *redor_non_uniform_source = R"(
-    __kernel void test_redor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_or(in[gid]);
-            }
-    }
-)";
-
-static const char *redxor_non_uniform_source = R"(
-    __kernel void test_redxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_xor(in[gid]);
-            }
-    }
-)";
-
-static const char *redand_non_uniform_logical_source = R"(
-    __kernel void test_redand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_logical_and(in[gid]);
-            }
-    }
-)";
-
-static const char *redor_non_uniform_logical_source = R"(
-    __kernel void test_redor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_logical_or(in[gid]);
-            }
-    }
-)";
-
-static const char *redxor_non_uniform_logical_source = R"(
-    __kernel void test_redxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_reduce_logical_xor(in[gid]);
-            }
+        uint subgroup_local_id = get_sub_group_local_id();
+        uint elect_work_item = 1 << (subgroup_local_id % 32);
+        uint work_item_mask = 0;
+        if(subgroup_local_id < 32) {
+            work_item_mask = work_item_mask_vector.x;
+        } else if(subgroup_local_id < 64) {
+            work_item_mask = work_item_mask_vector.y;
+        } else if(subgroup_local_id < 96) {
+            work_item_mask = work_item_mask_vector.z;
+        } else if(subgroup_local_id < 128) {
+            work_item_mask = work_item_mask_vector.w;
+        }
+        if (elect_work_item & work_item_mask){
+            out[gid] = %s(in[gid]);
+        }
     }
 )";
 
@@ -354,52 +47,52 @@ template <typename T>
 int run_functions_add_mul_max_min_for_type(RunTestForType rft)
 {
     int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
-        "test_scinadd_non_uniform", scinadd_non_uniform_source);
+        "sub_group_non_uniform_scan_inclusive_add");
     error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::mul_>>(
-        "test_scinmul_non_uniform", scinmul_non_uniform_source);
+        "sub_group_non_uniform_scan_inclusive_mul");
     error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
-        "test_scinmax_non_uniform", scinmax_non_uniform_source);
+        "sub_group_non_uniform_scan_inclusive_max");
     error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
-        "test_scinmin_non_uniform", scinmin_non_uniform_source);
+        "sub_group_non_uniform_scan_inclusive_min");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
-        "test_scexadd_non_uniform", scexadd_non_uniform_source);
+        "sub_group_non_uniform_scan_exclusive_add");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::mul_>>(
-        "test_scexmul_non_uniform", scexmul_non_uniform_source);
+        "sub_group_non_uniform_scan_exclusive_mul");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
-        "test_scexmax_non_uniform", scexmax_non_uniform_source);
+        "sub_group_non_uniform_scan_exclusive_max");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
-        "test_scexmin_non_uniform", scexmin_non_uniform_source);
+        "sub_group_non_uniform_scan_exclusive_min");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>(
-        "test_redadd_non_uniform", redadd_non_uniform_source);
+        "sub_group_non_uniform_reduce_add");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::mul_>>(
-        "test_redmul_non_uniform", redmul_non_uniform_source);
+        "sub_group_non_uniform_reduce_mul");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>(
-        "test_redmax_non_uniform", redmax_non_uniform_source);
+        "sub_group_non_uniform_reduce_max");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>(
-        "test_redmin_non_uniform", redmin_non_uniform_source);
+        "sub_group_non_uniform_reduce_min");
     return error;
 }
 
 template <typename T> int run_functions_and_or_xor_for_type(RunTestForType rft)
 {
     int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::and_>>(
-        "test_scinand_non_uniform", scinand_non_uniform_source);
+        "sub_group_non_uniform_scan_inclusive_and");
     error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::or_>>(
-        "test_scinor_non_uniform", scinor_non_uniform_source);
+        "sub_group_non_uniform_scan_inclusive_or");
     error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::xor_>>(
-        "test_scinxor_non_uniform", scinxor_non_uniform_source);
+        "sub_group_non_uniform_scan_inclusive_xor");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::and_>>(
-        "test_scexand_non_uniform", scexand_non_uniform_source);
+        "sub_group_non_uniform_scan_exclusive_and");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::or_>>(
-        "test_scexor_non_uniform", scexor_non_uniform_source);
+        "sub_group_non_uniform_scan_exclusive_or");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::xor_>>(
-        "test_scexxor_non_uniform", scexxor_non_uniform_source);
+        "sub_group_non_uniform_scan_exclusive_xor");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::and_>>(
-        "test_redand_non_uniform", redand_non_uniform_source);
+        "sub_group_non_uniform_reduce_and");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::or_>>(
-        "test_redor_non_uniform", redor_non_uniform_source);
+        "sub_group_non_uniform_reduce_or");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::xor_>>(
-        "test_redxor_non_uniform", redxor_non_uniform_source);
+        "sub_group_non_uniform_reduce_xor");
     return error;
 }
 
@@ -407,23 +100,23 @@ template <typename T>
 int run_functions_logical_and_or_xor_for_type(RunTestForType rft)
 {
     int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_and>>(
-        "test_scinand_non_uniform_logical", scinand_non_uniform_logical_source);
+        "sub_group_non_uniform_scan_inclusive_logical_and");
     error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_or>>(
-        "test_scinor_non_uniform_logical", scinor_non_uniform_logical_source);
+        "sub_group_non_uniform_scan_inclusive_logical_or");
     error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_xor>>(
-        "test_scinxor_non_uniform_logical", scinxor_non_uniform_logical_source);
+        "sub_group_non_uniform_scan_inclusive_logical_xor");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_and>>(
-        "test_scexand_non_uniform_logical", scexand_non_uniform_logical_source);
+        "sub_group_non_uniform_scan_exclusive_logical_and");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_or>>(
-        "test_scexor_non_uniform_logical", scexor_non_uniform_logical_source);
+        "sub_group_non_uniform_scan_exclusive_logical_or");
     error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_xor>>(
-        "test_scexxor_non_uniform_logical", scexxor_non_uniform_logical_source);
+        "sub_group_non_uniform_scan_exclusive_logical_xor");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_and>>(
-        "test_redand_non_uniform_logical", redand_non_uniform_logical_source);
+        "sub_group_non_uniform_reduce_logical_and");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_or>>(
-        "test_redor_non_uniform_logical", redor_non_uniform_logical_source);
+        "sub_group_non_uniform_reduce_logical_or");
     error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_xor>>(
-        "test_redxor_non_uniform_logical", redxor_non_uniform_logical_source);
+        "sub_group_non_uniform_reduce_logical_xor");
     return error;
 }
 
@@ -441,13 +134,11 @@ int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device,
                  "this device, skipping test.\n");
         return TEST_SKIPPED_ITSELF;
     }
-    std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
-                                 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
-                                 0x00ffff00, 0x80000000, 0xaaaaaaaa };
 
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
-    WorkGroupParams test_params(global_work_size, local_work_size, masks);
+    WorkGroupParams test_params(global_work_size, local_work_size, true);
+    test_params.save_kernel_source(sub_group_non_uniform_arithmetic_source);
     RunTestForType rft(device, context, queue, num_elements, test_params);
 
     int error = run_functions_add_mul_max_min_for_type<cl_int>(rft);
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
index 484e9b6b49..f956960b0d 100644
--- a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
@@ -28,7 +28,6 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
         int nj = (nw + ns - 1) / ns;
         int non_uniform_size = ng % nw;
         ng = ng / nw;
@@ -40,9 +39,11 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
                  operation_names(operation));
 
         log_info("  test params: global size = %d local size = %d subgroups "
-                 "size = %d work item mask = 0x%x data type (%s)\n",
-                 test_params.global_workgroup_size, nw, ns, work_items_mask,
+                 "size = %d data type (%s)\n",
+                 test_params.global_workgroup_size, nw, ns,
                  TypeManager<T>::name());
+        log_info("               work items mask: %s\n",
+                 test_params.work_items_mask.to_string().c_str());
         if (non_uniform_size)
         {
             log_info("  non uniform work group size mode ON\n");
@@ -99,7 +100,6 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
-        uint32_t work_items_mask = test_params.work_items_mask;
         int nj = (nw + ns - 1) / ns;
         cl_int tr, rr;
         int non_uniform_size = ng % nw;
@@ -141,8 +141,7 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
                 std::set<int> active_work_items;
                 for (i = 0; i < n; ++i)
                 {
-                    uint32_t check_work_item = 1 << (i % 32);
-                    if (work_items_mask & check_work_item)
+                    if (test_params.work_items_mask.test(i))
                     {
                         active_work_items.insert(i);
                         switch (operation)
@@ -215,46 +214,47 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
         return TEST_PASS;
     }
 };
-static const char *elect_source = R"(
-    __kernel void test_elect(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_elect();
-            }
-    }
-)";
-
-static const char *non_uniform_any_source = R"(
-    __kernel void test_non_uniform_any(const __global Type *in, __global int4 *xy, __global Type *out) {
-        int gid = get_global_id(0);
-        XY(xy,gid);
-        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_any(in[gid]);
-            }
-    }
-)";
 
-static const char *non_uniform_all_source = R"(
-    __kernel void test_non_uniform_all(const __global Type *in, __global int4 *xy, __global Type *out) {
+std::string sub_group_elect_source = R"(
+    __kernel void test_sub_group_elect(const __global Type *in, __global int4 *xy, __global Type *out) {
         int gid = get_global_id(0);
         XY(xy,gid);
-        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_all(in[gid]);
-            }
+        uint subgroup_local_id = get_sub_group_local_id();
+        uint elect_work_item = 1 << (subgroup_local_id % 32);
+        uint work_item_mask = 0;
+        if(subgroup_local_id < 32) {
+            work_item_mask = work_item_mask_vector.x;
+        } else if(subgroup_local_id < 64) {
+            work_item_mask = work_item_mask_vector.y;
+        } else if(subgroup_local_id < 96) {
+            work_item_mask = work_item_mask_vector.z;
+        } else if(subgroup_local_id < 128) {
+            work_item_mask = work_item_mask_vector.w;
+        }
+        if (elect_work_item & work_item_mask){
+            out[gid] = sub_group_elect();
+        }
     }
 )";
 
-static const char *non_uniform_all_equal_source = R"(
-    __kernel void test_non_uniform_all_equal(const __global Type *in, __global int4 *xy, __global Type *out) {
+std::string sub_group_non_uniform_any_all_all_equal_source = R"(
+    __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
         int gid = get_global_id(0);
         XY(xy,gid);
-        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
-            if (elect_work_item & WORK_ITEMS_MASK){
-                out[gid] = sub_group_non_uniform_all_equal(in[gid]);
+        uint subgroup_local_id = get_sub_group_local_id();
+        uint elect_work_item = 1 << (subgroup_local_id % 32);
+        uint work_item_mask = 0;
+        if(subgroup_local_id < 32) {
+            work_item_mask = work_item_mask_vector.x;
+        } else if(subgroup_local_id < 64) {
+            work_item_mask = work_item_mask_vector.y;
+        } else if(subgroup_local_id < 96) {
+            work_item_mask = work_item_mask_vector.z;
+        } else if(subgroup_local_id < 128) {
+            work_item_mask = work_item_mask_vector.w;
+        }
+        if (elect_work_item & work_item_mask){
+                out[gid] = %s(in[gid]);
             }
     }
 )";
@@ -262,7 +262,7 @@ static const char *non_uniform_all_equal_source = R"(
 template <typename T> int run_vote_all_equal_for_type(RunTestForType rft)
 {
     int error = rft.run_impl<T, VOTE<T, NonUniformVoteOp::all_equal>>(
-        "test_non_uniform_all_equal", non_uniform_all_equal_source);
+        "sub_group_non_uniform_all_equal");
     return error;
 }
 }
@@ -278,12 +278,13 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device,
                  "device, skipping test.\n");
         return TEST_SKIPPED_ITSELF;
     }
-    std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
-                                 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
-                                 0x00ffff00, 0x80000000 };
+
     constexpr size_t global_work_size = 170;
     constexpr size_t local_work_size = 64;
-    WorkGroupParams test_params(global_work_size, local_work_size, masks);
+    WorkGroupParams test_params(global_work_size, local_work_size, true);
+    test_params.save_kernel_source(
+        sub_group_non_uniform_any_all_all_equal_source);
+    test_params.save_kernel_source(sub_group_elect_source, "sub_group_elect");
     RunTestForType rft(device, context, queue, num_elements, test_params);
 
     int error = run_vote_all_equal_for_type<cl_int>(rft);
@@ -295,10 +296,10 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device,
     error |= run_vote_all_equal_for_type<subgroups::cl_half>(rft);
 
     error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::all>>(
-        "test_non_uniform_all", non_uniform_all_source);
+        "sub_group_non_uniform_all");
     error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::elect>>(
-        "test_elect", elect_source);
+        "sub_group_elect");
     error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::any>>(
-        "test_non_uniform_any", non_uniform_any_source);
+        "sub_group_non_uniform_any");
     return error;
 }
diff --git a/test_conformance/subgroups/test_subgroup_shuffle.cpp b/test_conformance/subgroups/test_subgroup_shuffle.cpp
index 37b27cedb5..56231cbfa9 100644
--- a/test_conformance/subgroups/test_subgroup_shuffle.cpp
+++ b/test_conformance/subgroups/test_subgroup_shuffle.cpp
@@ -15,38 +15,19 @@
 //
 #include "procs.h"
 #include "subhelpers.h"
+#include "subgroup_common_kernels.h"
 #include "subgroup_common_templates.h"
 #include "harness/typeWrappers.h"
 #include <bitset>
 
 namespace {
 
-static const char* shuffle_xor_source =
-    "__kernel void test_sub_group_shuffle_xor(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    out[gid] = sub_group_shuffle_xor(x, xy[gid].z);"
-    "}\n";
-
-static const char* shuffle_source =
-    "__kernel void test_sub_group_shuffle(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    out[gid] = sub_group_shuffle(x, xy[gid].z);"
-    "}\n";
-
 template <typename T> int run_shuffle_for_type(RunTestForType rft)
 {
-    int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>(
-        "test_sub_group_shuffle", shuffle_source);
+    int error =
+        rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>("sub_group_shuffle");
     error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_xor>>(
-        "test_sub_group_shuffle_xor", shuffle_xor_source);
+        "sub_group_shuffle_xor");
     return error;
 }
 
@@ -61,9 +42,11 @@ int test_subgroup_functions_shuffle(cl_device_id device, cl_context context,
                  "skipping test.\n");
         return TEST_SKIPPED_ITSELF;
     }
+
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
     WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.save_kernel_source(sub_group_generic_source);
     RunTestForType rft(device, context, queue, num_elements, test_params);
 
     int error = run_shuffle_for_type<cl_int>(rft);
diff --git a/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp
index 11401e80bc..caa1dccca9 100644
--- a/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp
+++ b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp
@@ -15,37 +15,19 @@
 //
 #include "procs.h"
 #include "subhelpers.h"
+#include "subgroup_common_kernels.h"
 #include "subgroup_common_templates.h"
 #include "harness/conversions.h"
 #include "harness/typeWrappers.h"
 
 namespace {
 
-static const char* shuffle_down_source =
-    "__kernel void test_sub_group_shuffle_down(const __global Type *in, "
-    "__global int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    out[gid] = sub_group_shuffle_down(x, xy[gid].z);"
-    "}\n";
-static const char* shuffle_up_source =
-    "__kernel void test_sub_group_shuffle_up(const __global Type *in, __global "
-    "int4 *xy, __global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    out[gid] = sub_group_shuffle_up(x, xy[gid].z);"
-    "}\n";
-
 template <typename T> int run_shuffle_relative_for_type(RunTestForType rft)
 {
-    int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>(
-        "test_sub_group_shuffle_up", shuffle_up_source);
+    int error =
+        rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>("sub_group_shuffle_up");
     error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_down>>(
-        "test_sub_group_shuffle_down", shuffle_down_source);
+        "sub_group_shuffle_down");
     return error;
 }
 
@@ -62,9 +44,11 @@ int test_subgroup_functions_shuffle_relative(cl_device_id device,
                  "device, skipping test.\n");
         return TEST_SKIPPED_ITSELF;
     }
+
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
     WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.save_kernel_source(sub_group_generic_source);
     RunTestForType rft(device, context, queue, num_elements, test_params);
 
     int error = run_shuffle_relative_for_type<cl_int>(rft);

From 7147d072c7bbed99e429cb8fe3e86139a12ef8bb Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Mon, 4 Oct 2021 15:42:44 +0200
Subject: [PATCH 134/158] Remove space character from extension name (#1336)

---
 test_common/gl/setup_x11.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/test_common/gl/setup_x11.cpp b/test_common/gl/setup_x11.cpp
index c54ecdec33..7efda3d2ac 100644
--- a/test_common/gl/setup_x11.cpp
+++ b/test_common/gl/setup_x11.cpp
@@ -90,10 +90,17 @@ class X11GLEnvironment : public GLEnvironment
         }
 
         for (int i=0; i<(int)num_of_devices; i++) {
-            if (!is_extension_available(devices[i], "cl_khr_gl_sharing ")) {
-                log_info("Device %d of %d does not support required extension cl_khr_gl_sharing.\n", i+1, num_of_devices);
-            } else {
-                log_info("Device %d of %d supports required extension cl_khr_gl_sharing.\n", i+1, num_of_devices);
+            if (!is_extension_available(devices[i], "cl_khr_gl_sharing"))
+            {
+                log_info("Device %d of %d does not support required extension "
+                         "cl_khr_gl_sharing.\n",
+                         i + 1, num_of_devices);
+            }
+            else
+            {
+                log_info("Device %d of %d supports required extension "
+                         "cl_khr_gl_sharing.\n",
+                         i + 1, num_of_devices);
                 found_valid_device = 1;
                 m_devices[m_device_count++] = devices[i];
             }

From 410f46f49fcec65d18d30b0df7a1d7ae0a4cd5db Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Wed, 3 Nov 2021 16:36:36 +0000
Subject: [PATCH 135/158] Add testing of sub_group_broadcast for (u)char and
 (u)short types (#1347)

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
---
 test_conformance/subgroups/test_subgroup_extended_types.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test_conformance/subgroups/test_subgroup_extended_types.cpp b/test_conformance/subgroups/test_subgroup_extended_types.cpp
index dbe2462337..c9e6bb616f 100644
--- a/test_conformance/subgroups/test_subgroup_extended_types.cpp
+++ b/test_conformance/subgroups/test_subgroup_extended_types.cpp
@@ -108,22 +108,26 @@ int test_subgroup_functions_extended_types(cl_device_id device,
     error |= run_broadcast_for_extended_type<cl_double8>(rft);
     error |= run_broadcast_for_extended_type<cl_double16>(rft);
 
+    error |= run_broadcast_for_extended_type<cl_ushort>(rft);
     error |= run_broadcast_for_extended_type<cl_ushort2>(rft);
     error |= run_broadcast_for_extended_type<subgroups::cl_ushort3>(rft);
     error |= run_broadcast_for_extended_type<cl_ushort4>(rft);
     error |= run_broadcast_for_extended_type<cl_ushort8>(rft);
     error |= run_broadcast_for_extended_type<cl_ushort16>(rft);
+    error |= run_broadcast_for_extended_type<cl_short>(rft);
     error |= run_broadcast_for_extended_type<cl_short2>(rft);
     error |= run_broadcast_for_extended_type<subgroups::cl_short3>(rft);
     error |= run_broadcast_for_extended_type<cl_short4>(rft);
     error |= run_broadcast_for_extended_type<cl_short8>(rft);
     error |= run_broadcast_for_extended_type<cl_short16>(rft);
 
+    error |= run_broadcast_for_extended_type<cl_uchar>(rft);
     error |= run_broadcast_for_extended_type<cl_uchar2>(rft);
     error |= run_broadcast_for_extended_type<subgroups::cl_uchar3>(rft);
     error |= run_broadcast_for_extended_type<cl_uchar4>(rft);
     error |= run_broadcast_for_extended_type<cl_uchar8>(rft);
     error |= run_broadcast_for_extended_type<cl_uchar16>(rft);
+    error |= run_broadcast_for_extended_type<cl_char>(rft);
     error |= run_broadcast_for_extended_type<cl_char2>(rft);
     error |= run_broadcast_for_extended_type<subgroups::cl_char3>(rft);
     error |= run_broadcast_for_extended_type<cl_char4>(rft);

From e9cd9a446e1b36a02f6e8f959256d5f96eda21a4 Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Mon, 8 Nov 2021 11:00:50 +0000
Subject: [PATCH 136/158] Remove excessive logging in subgroup tests (#1343)

This also adds some missing data type logging to the
subgroup_functions_non_uniform_vote tests.

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
---
 .../subgroups/subgroup_common_templates.h     | 37 ------------
 .../subgroups/test_subgroup_ballot.cpp        | 14 -----
 .../test_subgroup_non_uniform_vote.cpp        | 58 +++++++------------
 3 files changed, 21 insertions(+), 88 deletions(-)

diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h
index 5c5f9560ac..349f810098 100644
--- a/test_conformance/subgroups/subgroup_common_templates.h
+++ b/test_conformance/subgroups/subgroup_common_templates.h
@@ -80,7 +80,6 @@ template <typename Ty, SubgroupsBroadcastOp operation> struct BC
                  TypeManager<Ty>::name());
         if (non_uniform_size)
         {
-            log_info("  non uniform work group size mode ON\n");
             ng++;
         }
         for (k = 0; k < ng; ++k)
@@ -581,14 +580,6 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
             : func_name = "sub_group_scan_exclusive";
         log_info("  %s_%s(%s)...\n", func_name.c_str(),
                  operation_names(operation), TypeManager<Ty>::name());
-        log_info("  test params: global size = %d local size = %d subgroups "
-                 "size = %d \n",
-                 test_params.global_workgroup_size, nw, ns);
-        if (test_params.work_items_mask.any())
-        {
-            log_info("               work items mask: %s\n",
-                     test_params.work_items_mask.to_string().c_str());
-        }
         genrand<Ty, operation>(x, t, m, ns, nw, ng);
     }
 
@@ -637,16 +628,10 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
                 }
                 if (active_work_items.empty())
                 {
-                    log_info("  No acitve workitems in workgroup id = %d "
-                             "subgroup id = %d - no calculation\n",
-                             k, j);
                     continue;
                 }
                 else if (active_work_items.size() == 1)
                 {
-                    log_info("  One active workitem in workgroup id = %d "
-                             "subgroup id = %d - no calculation\n",
-                             k, j);
                     continue;
                 }
                 else
@@ -702,14 +687,6 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
         genrand<Ty, operation>(x, t, m, ns, nw, ng);
         log_info("  %s_%s(%s)...\n", func_name.c_str(),
                  operation_names(operation), TypeManager<Ty>::name());
-        log_info("  test params: global size = %d local size = %d subgroups "
-                 "size = %d \n",
-                 test_params.global_workgroup_size, nw, ns);
-        if (test_params.work_items_mask.any())
-        {
-            log_info("               work items mask: %s\n",
-                     test_params.work_items_mask.to_string().c_str());
-        }
     }
 
     static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
@@ -765,9 +742,6 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
                 }
                 if (active_work_items.empty())
                 {
-                    log_info("  No acitve workitems in workgroup id = %d "
-                             "subgroup id = %d - no calculation\n",
-                             k, j);
                     continue;
                 }
                 else
@@ -828,14 +802,6 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
             : func_name = "sub_group_reduce";
         log_info("  %s_%s(%s)...\n", func_name.c_str(),
                  operation_names(operation), TypeManager<Ty>::name());
-        log_info("  test params: global size = %d local size = %d subgroups "
-                 "size = %d \n",
-                 test_params.global_workgroup_size, nw, ns);
-        if (test_params.work_items_mask.any())
-        {
-            log_info("               work items mask: %s\n",
-                     test_params.work_items_mask.to_string().c_str());
-        }
         genrand<Ty, operation>(x, t, m, ns, nw, ng);
     }
 
@@ -894,9 +860,6 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
 
                 if (active_work_items.empty())
                 {
-                    log_info("  No acitve workitems in workgroup id = %d "
-                             "subgroup id = %d - no calculation\n",
-                             k, j);
                     continue;
                 }
 
diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp
index 2bd54e4355..ac90bad7a5 100644
--- a/test_conformance/subgroups/test_subgroup_ballot.cpp
+++ b/test_conformance/subgroups/test_subgroup_ballot.cpp
@@ -31,10 +31,6 @@ template <typename Ty> struct BALLOT
         int sbs = test_params.subgroup_size;
         int non_uniform_size = gws % lws;
         log_info("  sub_group_ballot...\n");
-        if (non_uniform_size)
-        {
-            log_info("  non uniform work group size mode ON\n");
-        }
     }
 
     static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
@@ -117,11 +113,6 @@ template <typename Ty, BallotOp operation> struct BALLOT_BIT_EXTRACT
         log_info("  sub_group_%s(%s)...\n", operation_names(operation),
                  TypeManager<Ty>::name());
 
-        if (non_uniform_size)
-        {
-            log_info("  non uniform work group size mode ON\n");
-        }
-
         for (wg_id = 0; wg_id < wg_number; ++wg_id)
         { // for each work_group
             for (sb_id = 0; sb_id < sb_number; ++sb_id)
@@ -275,10 +266,6 @@ template <typename Ty, BallotOp operation> struct BALLOT_INVERSE
         int sbs = test_params.subgroup_size;
         int non_uniform_size = gws % lws;
         log_info("  sub_group_inverse_ballot...\n");
-        if (non_uniform_size)
-        {
-            log_info("  non uniform work group size mode ON\n");
-        }
         // no work here
     }
 
@@ -379,7 +366,6 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
                  TypeManager<Ty>::name());
         if (non_uniform_size)
         {
-            log_info("  non uniform work group size mode ON\n");
             wg_number++;
         }
         int e;
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
index f956960b0d..835de25d7c 100644
--- a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
@@ -34,20 +34,10 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
         int last_subgroup_size = 0;
         ii = 0;
 
-        log_info("  sub_group_%s%s... \n",
+        log_info("  sub_group_%s%s(%s)... \n",
                  (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
-                 operation_names(operation));
+                 operation_names(operation), TypeManager<T>::name());
 
-        log_info("  test params: global size = %d local size = %d subgroups "
-                 "size = %d data type (%s)\n",
-                 test_params.global_workgroup_size, nw, ns,
-                 TypeManager<T>::name());
-        log_info("               work items mask: %s\n",
-                 test_params.work_items_mask.to_string().c_str());
-        if (non_uniform_size)
-        {
-            log_info("  non uniform work group size mode ON\n");
-        }
         if (operation == NonUniformVoteOp::elect) return;
 
         for (k = 0; k < ng; ++k)
@@ -171,34 +161,28 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
                 }
                 if (active_work_items.empty())
                 {
-                    log_info("  no one workitem acitve... in workgroup id = %d "
-                             "subgroup id = %d\n",
-                             k, j);
+                    continue;
                 }
-                else
+                auto lowest_active = active_work_items.begin();
+                for (const int &active_work_item : active_work_items)
                 {
-                    auto lowest_active = active_work_items.begin();
-                    for (const int &active_work_item : active_work_items)
+                    i = active_work_item;
+                    if (operation == NonUniformVoteOp::elect)
                     {
-                        i = active_work_item;
-                        if (operation == NonUniformVoteOp::elect)
-                        {
-                            i == *lowest_active ? tr = 1 : tr = 0;
-                        }
+                        i == *lowest_active ? tr = 1 : tr = 0;
+                    }
 
-                        // normalize device values on host, non zero set 1.
-                        rr = compare_ordered<T>(my[ii + i], 0) ? 0 : 1;
+                    // normalize device values on host, non zero set 1.
+                    rr = compare_ordered<T>(my[ii + i], 0) ? 0 : 1;
 
-                        if (rr != tr)
-                        {
-                            log_error("ERROR: sub_group_%s() \n",
-                                      operation_names(operation));
-                            log_error(
-                                "mismatch for work item %d sub group %d in "
-                                "work group %d. Expected: %d Obtained: %d\n",
-                                i, j, k, tr, rr);
-                            return TEST_FAIL;
-                        }
+                    if (rr != tr)
+                    {
+                        log_error("ERROR: sub_group_%s() \n",
+                                  operation_names(operation));
+                        log_error("mismatch for work item %d sub group %d in "
+                                  "work group %d. Expected: %d Obtained: %d\n",
+                                  i, j, k, tr, rr);
+                        return TEST_FAIL;
                     }
                 }
             }
@@ -208,9 +192,9 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
             m += 4 * nw;
         }
 
-        log_info("  sub_group_%s%s... passed\n",
+        log_info("  sub_group_%s%s(%s)... passed\n",
                  (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
-                 operation_names(operation));
+                 operation_names(operation), TypeManager<T>::name());
         return TEST_PASS;
     }
 };

From 1116a71ba2994ecf761d2ab853de7de51448500d Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Tue, 16 Nov 2021 11:27:04 +0000
Subject: [PATCH 137/158] Improve error handling in subgroup tests (#1352)

* MPGCOMP-14761 Improve error handling in subgroup tests

Signed-off-by: Stuart Brady <stuart.brady@arm.com>

* Add missing newline
---
 test_common/harness/errorHelpers.h            |  1 +
 .../subgroups/subgroup_common_templates.h     | 20 +++----
 test_conformance/subgroups/subhelpers.h       | 58 ++++++++++++-------
 test_conformance/subgroups/test_barrier.cpp   | 10 ++--
 test_conformance/subgroups/test_ifp.cpp       | 12 ++--
 test_conformance/subgroups/test_subgroup.cpp  |  4 +-
 .../subgroups/test_subgroup_ballot.cpp        | 20 +++----
 .../test_subgroup_clustered_reduce.cpp        |  4 +-
 .../test_subgroup_non_uniform_vote.cpp        |  4 +-
 9 files changed, 74 insertions(+), 59 deletions(-)

diff --git a/test_common/harness/errorHelpers.h b/test_common/harness/errorHelpers.h
index d59bc78de0..80eb3b58ef 100644
--- a/test_common/harness/errorHelpers.h
+++ b/test_common/harness/errorHelpers.h
@@ -62,6 +62,7 @@ static int vlog_win32(const char *format, ...);
         return TEST_FAIL;                                                      \
     }
 #define test_error(errCode, msg) test_error_ret(errCode, msg, errCode)
+#define test_error_fail(errCode, msg) test_error_ret(errCode, msg, TEST_FAIL)
 #define test_error_ret(errCode, msg, retValue)                                 \
     {                                                                          \
         auto errCodeResult = errCode;                                          \
diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h
index 349f810098..cfe02c2f40 100644
--- a/test_conformance/subgroups/subgroup_common_templates.h
+++ b/test_conformance/subgroups/subgroup_common_templates.h
@@ -168,8 +168,8 @@ template <typename Ty, SubgroupsBroadcastOp operation> struct BC
         }
     }
 
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
     {
         int ii, i, j, k, l, n;
         int ng = test_params.global_workgroup_size;
@@ -499,8 +499,8 @@ template <typename Ty, ShuffleOp operation> struct SHF
         }
     }
 
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
     {
         int ii, i, j, k, l, n;
         int nw = test_params.local_workgroup_size;
@@ -583,8 +583,8 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
         genrand<Ty, operation>(x, t, m, ns, nw, ng);
     }
 
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
     {
         int ii, i, j, k, n;
         int nw = test_params.local_workgroup_size;
@@ -689,8 +689,8 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
                  operation_names(operation), TypeManager<Ty>::name());
     }
 
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
     {
         int ii, i, j, k, n;
         int nw = test_params.local_workgroup_size;
@@ -805,8 +805,8 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
         genrand<Ty, operation>(x, t, m, ns, nw, ng);
     }
 
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
     {
         int ii, i, j, k, n;
         int nw = test_params.local_workgroup_size;
diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h
index 6d32928aed..bd4b6d61e9 100644
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -1375,25 +1375,31 @@ static int run_kernel(cl_context context, cl_command_queue queue,
 // Driver for testing a single built in function
 template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
 {
-    static int mrun(cl_device_id device, cl_context context,
-                    cl_command_queue queue, int num_elements, const char *kname,
-                    const char *src, WorkGroupParams test_params)
+    static test_status mrun(cl_device_id device, cl_context context,
+                            cl_command_queue queue, int num_elements,
+                            const char *kname, const char *src,
+                            WorkGroupParams test_params)
     {
-        int error = TEST_PASS;
+        test_status combined_error = TEST_SKIPPED_ITSELF;
         for (auto &mask : test_params.all_work_item_masks)
         {
             test_params.work_items_mask = mask;
-            error |= run(device, context, queue, num_elements, kname, src,
-                         test_params);
+            test_status error = run(device, context, queue, num_elements, kname,
+                                    src, test_params);
+
+            if (error == TEST_FAIL
+                || (error == TEST_PASS && combined_error != TEST_FAIL))
+                combined_error = error;
         }
-        return error;
+        return combined_error;
     };
-    static int run(cl_device_id device, cl_context context,
-                   cl_command_queue queue, int num_elements, const char *kname,
-                   const char *src, WorkGroupParams test_params)
+    static test_status run(cl_device_id device, cl_context context,
+                           cl_command_queue queue, int num_elements,
+                           const char *kname, const char *src,
+                           WorkGroupParams test_params)
     {
         size_t tmp;
-        int error;
+        cl_int error;
         int subgroup_size, num_subgroups;
         size_t realSize;
         size_t global = test_params.global_workgroup_size;
@@ -1434,7 +1440,7 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
         if (!TypeManager<Ty>::type_supported(device))
         {
             log_info("Data type not supported : %s\n", TypeManager<Ty>::name());
-            return 0;
+            return TEST_SKIPPED_ITSELF;
         }
         else
         {
@@ -1450,7 +1456,7 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
 
         error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform),
                                 (void *)&platform, NULL);
-        test_error(error, "clGetDeviceInfo failed for CL_DEVICE_PLATFORM");
+        test_error_fail(error, "clGetDeviceInfo failed for CL_DEVICE_PLATFORM");
         if (test_params.use_core_subgroups)
         {
             kernel_sstr
@@ -1465,12 +1471,12 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
 
         error = create_single_kernel_helper(context, &program, &kernel, 1,
                                             &kernel_src, kname);
-        if (error != 0) return error;
+        if (error != CL_SUCCESS) return TEST_FAIL;
 
         // Determine some local dimensions to use for the test.
         error = get_max_common_work_group_size(
             context, kernel, test_params.global_workgroup_size, &local);
-        test_error(error, "get_max_common_work_group_size failed");
+        test_error_fail(error, "get_max_common_work_group_size failed");
 
         // Limit it a bit so we have muliple work groups
         // Ideally this will still be large enough to give us multiple
@@ -1543,7 +1549,7 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
                            input_array_size * sizeof(Ty), sgmap.data(),
                            global * sizeof(cl_int4), odata.data(),
                            output_array_size * sizeof(Ty), TSIZE * sizeof(Ty));
-        test_error(error, "Running kernel first time failed");
+        test_error_fail(error, "Running kernel first time failed");
 
         // Generate the desired input for the kernel
 
@@ -1553,13 +1559,18 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
                            input_array_size * sizeof(Ty), sgmap.data(),
                            global * sizeof(cl_int4), odata.data(),
                            output_array_size * sizeof(Ty), TSIZE * sizeof(Ty));
-        test_error(error, "Running kernel second time failed");
+        test_error_fail(error, "Running kernel second time failed");
 
         // Check the result
-        error = Fns::chk(idata.data(), odata.data(), mapin.data(),
-                         mapout.data(), sgmap.data(), test_params);
-        test_error(error, "Data verification failed");
-        return TEST_PASS;
+        test_status status = Fns::chk(idata.data(), odata.data(), mapin.data(),
+                                      mapout.data(), sgmap.data(), test_params);
+        // Detailed failure and skip messages should be logged by Fns::gen
+        // and Fns::chk.
+        if (status == TEST_FAIL)
+        {
+            test_fail("Data verification failed\n");
+        }
+        return status;
     }
 };
 
@@ -1625,7 +1636,10 @@ struct RunTestForType
                                     test_params_);
         }
 
-        return error;
+        // If we return TEST_SKIPPED_ITSELF here, then an entire suite may be
+        // reported as having been skipped even if some tests within it
+        // passed, as the status codes are erroneously ORed together:
+        return error == TEST_FAIL ? TEST_FAIL : TEST_PASS;
     }
 
 private:
diff --git a/test_conformance/subgroups/test_barrier.cpp b/test_conformance/subgroups/test_barrier.cpp
index 47e42f65af..b570e92292 100644
--- a/test_conformance/subgroups/test_barrier.cpp
+++ b/test_conformance/subgroups/test_barrier.cpp
@@ -92,8 +92,8 @@ template <int Which> struct BAR
         }
     }
 
-    static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my,
+                           cl_int *m, const WorkGroupParams &test_params)
     {
         int ii, i, j, k, n;
         int nw = test_params.local_workgroup_size;
@@ -133,7 +133,7 @@ template <int Which> struct BAR
                                   "id %d in sub group %d in group %d expected "
                                   "%d got %d\n",
                                   i, j, k, tr, rr);
-                        return -1;
+                        return TEST_FAIL;
                     }
                 }
             }
@@ -143,7 +143,7 @@ template <int Which> struct BAR
             m += 2 * nw;
         }
 
-        return 0;
+        return TEST_PASS;
     }
 };
 
@@ -187,4 +187,4 @@ int test_barrier_functions_ext(cl_device_id device, cl_context context,
     }
 
     return test_barrier_functions(device, context, queue, num_elements, false);
-}
\ No newline at end of file
+}
diff --git a/test_conformance/subgroups/test_ifp.cpp b/test_conformance/subgroups/test_ifp.cpp
index fccaa8c79d..f6c5227dfc 100644
--- a/test_conformance/subgroups/test_ifp.cpp
+++ b/test_conformance/subgroups/test_ifp.cpp
@@ -245,8 +245,8 @@ struct IFP
         }
     }
 
-    static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *,
-                   const WorkGroupParams &test_params)
+    static test_status chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *,
+                           const WorkGroupParams &test_params)
     {
         int i, k;
         int nw = test_params.local_workgroup_size;
@@ -255,8 +255,8 @@ struct IFP
         int nj = (nw + ns - 1) / ns;
         ng = ng / nw;
 
-        // We need at least 2 sub groups per group for this tes
-        if (nj == 1) return 0;
+        // We need at least 2 sub groups per group for this test
+        if (nj == 1) return TEST_SKIPPED_ITSELF;
 
         log_info("  independent forward progress...\n");
 
@@ -270,14 +270,14 @@ struct IFP
                     log_error(
                         "ERROR: mismatch at element %d in work group %d\n", i,
                         k);
-                    return -1;
+                    return TEST_FAIL;
                 }
             }
             x += nj * (NUM_LOC + 1);
             y += NUM_LOC;
         }
 
-        return 0;
+        return TEST_PASS;
     }
 };
 
diff --git a/test_conformance/subgroups/test_subgroup.cpp b/test_conformance/subgroups/test_subgroup.cpp
index 63bfc4532c..eefca5f8a4 100644
--- a/test_conformance/subgroups/test_subgroup.cpp
+++ b/test_conformance/subgroups/test_subgroup.cpp
@@ -68,8 +68,8 @@ template <NonUniformVoteOp operation> struct AA
         }
     }
 
-    static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my,
+                           cl_int *m, const WorkGroupParams &test_params)
     {
         int ii, i, j, k, n;
         int ng = test_params.global_workgroup_size;
diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp
index ac90bad7a5..0228e82c2a 100644
--- a/test_conformance/subgroups/test_subgroup_ballot.cpp
+++ b/test_conformance/subgroups/test_subgroup_ballot.cpp
@@ -33,8 +33,8 @@ template <typename Ty> struct BALLOT
         log_info("  sub_group_ballot...\n");
     }
 
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
     {
         int wi_id, wg_id, sb_id;
         int gws = test_params.global_workgroup_size;
@@ -146,8 +146,8 @@ template <typename Ty, BallotOp operation> struct BALLOT_BIT_EXTRACT
         }
     }
 
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
     {
         int wi_id, wg_id, l, sb_id;
         int gws = test_params.global_workgroup_size;
@@ -269,8 +269,8 @@ template <typename Ty, BallotOp operation> struct BALLOT_INVERSE
         // no work here
     }
 
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
     {
         int wi_id, wg_id, sb_id;
         int gws = test_params.global_workgroup_size;
@@ -444,8 +444,8 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
         return mask;
     }
 
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
     {
         int wi_id, wg_id, sb_id;
         int gws = test_params.global_workgroup_size;
@@ -617,8 +617,8 @@ template <typename Ty, BallotOp operation> struct SMASK
         }
     }
 
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
     {
         int wi_id, wg_id, sb_id;
         int gws = test_params.global_workgroup_size;
diff --git a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
index 11fcebc4a2..ad9e1ff228 100644
--- a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
+++ b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
@@ -50,8 +50,8 @@ template <typename Ty, ArithmeticOp operation> struct RED_CLU
         genrand<Ty, operation>(x, t, m, ns, nw, ng);
     }
 
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                           const WorkGroupParams &test_params)
     {
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
index 835de25d7c..b21a9f7eed 100644
--- a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
@@ -83,8 +83,8 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
         }
     }
 
-    static int chk(T *x, T *y, T *mx, T *my, cl_int *m,
-                   const WorkGroupParams &test_params)
+    static test_status chk(T *x, T *y, T *mx, T *my, cl_int *m,
+                           const WorkGroupParams &test_params)
     {
         int ii, i, j, k, n;
         int nw = test_params.local_workgroup_size;

From 1c6dbc23e74afeb5dcfdf2de2d69734c6b02a845 Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Tue, 16 Nov 2021 14:03:06 +0000
Subject: [PATCH 138/158] Clean up logging in cl_khr_subgroup_ballot tests
 (#1351)

The tests were logging scalar results as vectors padded with zeroes for
no apparent benefit.  Fix this.

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
---
 .../subgroups/test_subgroup_ballot.cpp        | 47 ++++++++-----------
 1 file changed, 19 insertions(+), 28 deletions(-)

diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp
index 0228e82c2a..ee2c5e511e 100644
--- a/test_conformance/subgroups/test_subgroup_ballot.cpp
+++ b/test_conformance/subgroups/test_subgroup_ballot.cpp
@@ -81,8 +81,8 @@ template <typename Ty> struct BALLOT
                     {
                         log_error(
                             "ERROR: sub_group_ballot mismatch for local id "
-                            "%d in sub group %d in group %d obtained {%d}, "
-                            "expected {%d} \n",
+                            "%d in sub group %d in group %d obtained %d, "
+                            "expected %d\n",
                             wi_id, sb_id, wg_id, device_result,
                             expected_result);
                         return TEST_FAIL;
@@ -455,7 +455,7 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
         int non_uniform_size = gws % lws;
         int wg_number = gws / lws;
         wg_number = non_uniform_size ? wg_number + 1 : wg_number;
-        cl_uint4 expected_result, device_result;
+        cl_uint expected_result, device_result;
         int last_subgroup_size = 0;
         int current_sbs = 0;
 
@@ -487,7 +487,7 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
                     current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
                 }
                 // Check result
-                expected_result = { 0, 0, 0, 0 };
+                expected_result = 0;
                 for (wi_id = 0; wi_id < current_sbs; ++wi_id)
                 { // for subgroup element
                     bs128 bs;
@@ -497,23 +497,20 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
                         | (bs128(mx[wg_offset + wi_id].s2) << 64)
                         | (bs128(mx[wg_offset + wi_id].s3) << 96);
                     bs &= getImportantBits(wi_id, current_sbs);
-                    device_result = my[wg_offset + wi_id];
+                    device_result = my[wg_offset + wi_id].s0;
                     if (operation == BallotOp::ballot_inclusive_scan
                         || operation == BallotOp::ballot_exclusive_scan
                         || operation == BallotOp::ballot_bit_count)
                     {
-                        expected_result.s0 = bs.count();
+                        expected_result = bs.count();
                         if (!compare(device_result, expected_result))
                         {
                             log_error("ERROR: sub_group_%s "
                                       "mismatch for local id %d in sub group "
-                                      "%d in group %d obtained {%d, %d, %d, "
-                                      "%d}, expected {%d, %d, %d, %d}\n",
+                                      "%d in group %d obtained %d, "
+                                      "expected %d\n",
                                       operation_names(operation), wi_id, sb_id,
-                                      wg_id, device_result.s0, device_result.s1,
-                                      device_result.s2, device_result.s3,
-                                      expected_result.s0, expected_result.s1,
-                                      expected_result.s2, expected_result.s3);
+                                      wg_id, device_result, expected_result);
                             return TEST_FAIL;
                         }
                     }
@@ -523,7 +520,7 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
                         {
                             if (bs.test(id))
                             {
-                                expected_result.s0 = id;
+                                expected_result = id;
                                 break;
                             }
                         }
@@ -531,13 +528,10 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
                         {
                             log_error("ERROR: sub_group_ballot_find_lsb "
                                       "mismatch for local id %d in sub group "
-                                      "%d in group %d obtained {%d, %d, %d, "
-                                      "%d}, expected {%d, %d, %d, %d}\n",
-                                      wi_id, sb_id, wg_id, device_result.s0,
-                                      device_result.s1, device_result.s2,
-                                      device_result.s3, expected_result.s0,
-                                      expected_result.s1, expected_result.s2,
-                                      expected_result.s3);
+                                      "%d in group %d obtained %d, "
+                                      "expected %d\n",
+                                      wi_id, sb_id, wg_id, device_result,
+                                      expected_result);
                             return TEST_FAIL;
                         }
                     }
@@ -547,7 +541,7 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
                         {
                             if (bs.test(id))
                             {
-                                expected_result.s0 = id;
+                                expected_result = id;
                                 break;
                             }
                         }
@@ -555,13 +549,10 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
                         {
                             log_error("ERROR: sub_group_ballot_find_msb "
                                       "mismatch for local id %d in sub group "
-                                      "%d in group %d obtained {%d, %d, %d, "
-                                      "%d}, expected {%d, %d, %d, %d}\n",
-                                      wi_id, sb_id, wg_id, device_result.s0,
-                                      device_result.s1, device_result.s2,
-                                      device_result.s3, expected_result.s0,
-                                      expected_result.s1, expected_result.s2,
-                                      expected_result.s3);
+                                      "%d in group %d obtained %d, "
+                                      "expected %d\n",
+                                      wi_id, sb_id, wg_id, device_result,
+                                      expected_result);
                             return TEST_FAIL;
                         }
                     }

From 3cd906aa9b8b96ae0651269c47d6b8cc475c62f5 Mon Sep 17 00:00:00 2001
From: marcat03 <94451804+marcat03@users.noreply.github.com>
Date: Tue, 16 Nov 2021 16:07:43 +0000
Subject: [PATCH 139/158] Fix missing cl_khr_semaphore extensions in compiler
 tests (#1357)

* Added missing extensions related to cl_khr_semaphore

Signed-off-by: Marco Cattani <marco.cattani@arm.com>
---
 .../compiler/test_compiler_defines_for_extensions.cpp          | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
index de30e06b20..2f29d39b68 100644
--- a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
+++ b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
@@ -75,6 +75,9 @@ const char *known_extensions[] = {
     "cl_khr_pci_bus_info",
     "cl_khr_suggested_local_work_size",
     "cl_khr_spirv_linkonce_odr",
+    "cl_khr_semaphore",
+    "cl_khr_external_semaphore",
+    "cl_khr_external_semaphore_sync_fd",
 };
 
 size_t num_known_extensions = sizeof(known_extensions) / sizeof(char *);

From c25709f3964f1675a03c1a4f1315a09a4386c0bc Mon Sep 17 00:00:00 2001
From: James Price <jrprice@google.com>
Date: Tue, 23 Nov 2021 14:04:02 -0500
Subject: [PATCH 140/158] Fix stack-use-after-scope crash in conversions
 (#1358)

The way that program sources were being constructed involved capturing
pointers to strings that were allocated on the stack, and then trying
to use them outside of that scope. This change uses a stringstream
defined in the outer scope to build the program instead.
---
 .../conversions/test_conversions.cpp          | 116 ++++++++----------
 1 file changed, 54 insertions(+), 62 deletions(-)

diff --git a/test_conformance/conversions/test_conversions.cpp b/test_conformance/conversions/test_conversions.cpp
index e8e572e667..d489e28a32 100644
--- a/test_conformance/conversions/test_conversions.cpp
+++ b/test_conformance/conversions/test_conversions.cpp
@@ -38,6 +38,7 @@
 #include <sys/param.h>
 #endif
 
+#include <sstream>
 #include <stdarg.h>
 #include <stdio.h>
 #include <string.h>
@@ -1559,84 +1560,40 @@ static cl_program   MakeProgram( Type outType, Type inType, SaturationMode sat,
     cl_program program;
     char testName[256];
     int error = 0;
-    const char **strings;
-    size_t stringCount = 0;
+
+    std::ostringstream source;
+    if (outType == kdouble || inType == kdouble)
+        source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
 
     // Create the program. This is a bit complicated because we are trying to avoid byte and short stores.
     if (0 == vectorSize)
     {
+        // Create the type names.
         char inName[32];
         char outName[32];
-        const char *programSource[] =
-        {
-            "", // optional pragma
-            "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
-            "{\n"
-            "   size_t i = get_global_id(0);\n"
-            "   dest[i] =  src[i];\n"
-            "}\n"
-        };
-        stringCount = sizeof(programSource) / sizeof(programSource[0]);
-        strings = programSource;
-
-        if (outType == kdouble || inType == kdouble)
-            programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
-
-        //create the type name
         strncpy(inName, gTypeNames[inType], sizeof(inName));
         strncpy(outName, gTypeNames[outType], sizeof(outName));
         sprintf(testName, "test_implicit_%s_%s", outName, inName);
-        vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType], gTypeNames[outType]);
+
+        source << "__kernel void " << testName << "( __global " << inName
+               << " *src, __global " << outName << " *dest )\n";
+        source << "{\n";
+        source << "   size_t i = get_global_id(0);\n";
+        source << "   dest[i] =  src[i];\n";
+        source << "}\n";
+
+        vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType],
+             gTypeNames[outType]);
         fflush(stdout);
     }
     else
     {
         int vectorSizetmp = vectorSizes[vectorSize];
 
+        // Create the type names.
         char convertString[128];
         char inName[32];
         char outName[32];
-        const char *programSource[] =
-        {
-            "", // optional pragma
-            "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
-            "{\n"
-            "   size_t i = get_global_id(0);\n"
-            "   dest[i] = ", convertString, "( src[i] );\n"
-            "}\n"
-        };
-        const char *programSourceV3[] =
-        {
-            "", // optional pragma
-            "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
-            "{\n"
-            "   size_t i = get_global_id(0);\n"
-            "   if( i + 1 < get_global_size(0))\n"
-            "       vstore3( ", convertString, "( vload3( i, src)), i, dest );\n"
-            "   else\n"
-            "   {\n"
-            "       ", inName, "3 in;\n"
-            "       ", outName, "3 out;\n"
-            "       if( 0 == (i & 1) )\n"
-            "           in.y = src[3*i+1];\n"
-            "       in.x = src[3*i];\n"
-            "       out = ", convertString, "( in ); \n"
-            "       dest[3*i] = out.x;\n"
-            "       if( 0 == (i & 1) )\n"
-            "           dest[3*i+1] = out.y;\n"
-            "   }\n"
-            "}\n"
-        };
-        stringCount = 3 == vectorSizetmp ? sizeof(programSourceV3) / sizeof(programSourceV3[0]) :
-            sizeof(programSource) / sizeof(programSource[0]);
-        strings = 3 == vectorSizetmp ? programSourceV3 : programSource;
-
-        if (outType == kdouble || inType == kdouble) {
-            programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
-            programSourceV3[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
-        }
-
-        //create the type name
         switch (vectorSizetmp)
         {
         case 1:
@@ -1661,8 +1618,40 @@ static cl_program   MakeProgram( Type outType, Type inType, SaturationMode sat,
             vlog("Building %s( %s ) test\n", convertString, inName);
             break;
         }
-
         fflush(stdout);
+
+        if (vectorSizetmp == 3)
+        {
+            source << "__kernel void " << testName << "( __global " << inName
+                   << " *src, __global " << outName << " *dest )\n";
+            source << "{\n";
+            source << "   size_t i = get_global_id(0);\n";
+            source << "   if( i + 1 < get_global_size(0))\n";
+            source << "       vstore3( " << convertString
+                   << "( vload3( i, src)), i, dest );\n";
+            source << "   else\n";
+            source << "   {\n";
+            source << "       " << inName << "3 in;\n";
+            source << "       " << outName << "3 out;\n";
+            source << "       if( 0 == (i & 1) )\n";
+            source << "           in.y = src[3*i+1];\n";
+            source << "       in.x = src[3*i];\n";
+            source << "       out = " << convertString << "( in ); \n";
+            source << "       dest[3*i] = out.x;\n";
+            source << "       if( 0 == (i & 1) )\n";
+            source << "           dest[3*i+1] = out.y;\n";
+            source << "   }\n";
+            source << "}\n";
+        }
+        else
+        {
+            source << "__kernel void " << testName << "( __global " << inName
+                   << " *src, __global " << outName << " *dest )\n";
+            source << "{\n";
+            source << "   size_t i = get_global_id(0);\n";
+            source << "   dest[i] = " << convertString << "( src[i] );\n";
+            source << "}\n";
+        }
     }
     *outKernel = NULL;
 
@@ -1671,7 +1660,10 @@ static cl_program   MakeProgram( Type outType, Type inType, SaturationMode sat,
         flags = "-cl-denorms-are-zero";
 
     // build it
-    error = create_single_kernel_helper(gContext, &program, outKernel, (cl_uint)stringCount, strings, testName, flags);
+    std::string sourceString = source.str();
+    const char *programSource = sourceString.c_str();
+    error = create_single_kernel_helper(gContext, &program, outKernel, 1,
+                                        &programSource, testName, flags);
     if (error)
     {
         char    buffer[2048] = "";

From 3eb0f50d85df0350af29f5f1dbbf5a3ddef906b3 Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Thu, 25 Nov 2021 13:36:20 +0000
Subject: [PATCH 141/158] Use maximum subgroup size in sub_group_ballot tests
 (#1344)

sub_group_ballot_bit_count() and sub_group_ballot_find_msb() mask
their input according to a subgroup size, which is assumed to be the
maximum subgroup size, and not the actual subgroup size excluding
non-existent work-items in the "remainder" subgroup.

Fix this as per the clarification made to the OpenCL C specification
in revision 3.0.9 for issue KhronosGroup/OpenCL-Docs#626 by pull request
KhronosGroup/OpenCL-Docs#689.

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
---
 test_conformance/subgroups/test_subgroup_ballot.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp
index ee2c5e511e..f362a501e5 100644
--- a/test_conformance/subgroups/test_subgroup_ballot.cpp
+++ b/test_conformance/subgroups/test_subgroup_ballot.cpp
@@ -496,7 +496,7 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
                         | (bs128(mx[wg_offset + wi_id].s1) << 32)
                         | (bs128(mx[wg_offset + wi_id].s2) << 64)
                         | (bs128(mx[wg_offset + wi_id].s3) << 96);
-                    bs &= getImportantBits(wi_id, current_sbs);
+                    bs &= getImportantBits(wi_id, sbs);
                     device_result = my[wg_offset + wi_id].s0;
                     if (operation == BallotOp::ballot_inclusive_scan
                         || operation == BallotOp::ballot_exclusive_scan
@@ -516,7 +516,7 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
                     }
                     else if (operation == BallotOp::ballot_find_lsb)
                     {
-                        for (int id = 0; id < current_sbs; ++id)
+                        for (int id = 0; id < sbs; ++id)
                         {
                             if (bs.test(id))
                             {
@@ -537,7 +537,7 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
                     }
                     else if (operation == BallotOp::ballot_find_msb)
                     {
-                        for (int id = current_sbs - 1; id >= 0; --id)
+                        for (int id = sbs - 1; id >= 0; --id)
                         {
                             if (bs.test(id))
                             {

From 6dff4fdffadff59c42083bd2f685598613c30091 Mon Sep 17 00:00:00 2001
From: BKoscielak <bartosz.koscielak@intel.com>
Date: Thu, 25 Nov 2021 14:40:19 +0100
Subject: [PATCH 142/158] Fix conversion data loss in test_api
 min_max_constant_args (#1355)

---
 test_conformance/api/test_api_min_max.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_conformance/api/test_api_min_max.cpp b/test_conformance/api/test_api_min_max.cpp
index 9e981cd3f3..8d132fe60a 100644
--- a/test_conformance/api/test_api_min_max.cpp
+++ b/test_conformance/api/test_api_min_max.cpp
@@ -1489,7 +1489,7 @@ int test_min_max_constant_args(cl_device_id deviceID, cl_context context, cl_com
 
     error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, 0 );
     test_error( error, "Unable to get max constant buffer size" );
-    individualBufferSize = ((int)maxSize/2)/maxArgs;
+    individualBufferSize = (maxSize / 2) / maxArgs;
 
     log_info("Reported max constant arg count of %d and max constant buffer size of %d. Test will attempt to allocate half of that, or %d buffers of size %d.\n",
              (int)maxArgs, (int)maxSize, (int)maxArgs, (int)individualBufferSize);

From 6f50623ba867ee5a847464e15937b1a9bda3506c Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Thu, 25 Nov 2021 14:41:06 +0100
Subject: [PATCH 143/158] Subgroups tests -
 sub_group_non_uniform_scan_exclusive function fixes (#1350)

* Fix - comparing results will never happen.

* No special action needed for one work item in the subgroup
---
 test_conformance/subgroups/subgroup_common_templates.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h
index cfe02c2f40..64b4b9718d 100644
--- a/test_conformance/subgroups/subgroup_common_templates.h
+++ b/test_conformance/subgroups/subgroup_common_templates.h
@@ -630,19 +630,12 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
                 {
                     continue;
                 }
-                else if (active_work_items.size() == 1)
-                {
-                    continue;
-                }
                 else
                 {
                     tr = TypeManager<Ty>::identify_limits(operation);
-                    int idx = 0;
                     for (const int &active_work_item : active_work_items)
                     {
                         rr = my[ii + active_work_item];
-                        if (idx == 0) continue;
-
                         if (!compare_ordered(rr, tr))
                         {
                             log_error(
@@ -655,7 +648,6 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
                         }
                         tr = calculate<Ty>(tr, mx[ii + active_work_item],
                                            operation);
-                        idx++;
                     }
                 }
             }

From 7625011b666c1a7c1fee5818309e9ed3d658a899 Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Fri, 26 Nov 2021 15:30:23 +0000
Subject: [PATCH 144/158] Remove unused inclusion of <cstdio> (#1362)

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
---
 .../subgroups/test_subgroup_non_uniform_arithmetic.cpp           | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
index bb257bcdf1..5ab4522268 100644
--- a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
@@ -17,7 +17,6 @@
 #include "subhelpers.h"
 #include "harness/typeWrappers.h"
 #include "subgroup_common_templates.h"
-#include <cstdio>
 
 namespace {
 

From f8ec235d3c1555fbfaa7eea6bf5f3b588de1b03f Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Thu, 2 Dec 2021 15:27:30 +0000
Subject: [PATCH 145/158] Tidy up code to determine bit mask for ballot scans
 (#1363)

It seems more intuitive to set only the bits that are required, rather
than to set one more bit than is required, only to clear it again.

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
---
 test_conformance/subgroups/test_subgroup_ballot.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp
index f362a501e5..e742aa3b64 100644
--- a/test_conformance/subgroups/test_subgroup_ballot.cpp
+++ b/test_conformance/subgroups/test_subgroup_ballot.cpp
@@ -437,9 +437,9 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
         else if (operation == BallotOp::ballot_inclusive_scan
                  || operation == BallotOp::ballot_exclusive_scan)
         {
-            for (cl_uint i = 0; i <= sub_group_local_id; ++i) mask.set(i);
-            if (operation == BallotOp::ballot_exclusive_scan)
-                mask.reset(sub_group_local_id);
+            for (cl_uint i = 0; i < sub_group_local_id; ++i) mask.set(i);
+            if (operation == BallotOp::ballot_inclusive_scan)
+                mask.set(sub_group_local_id);
         }
         return mask;
     }

From e106be14f9d21a13d485c8256da6cccb933850cd Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Sat, 4 Dec 2021 18:55:17 +0100
Subject: [PATCH 146/158] Test api min max - fix printing cl_ulong data type
 (#1212)

* test api - fix code formatting only

* Fix printing cl_ulong type to avoid overloading.

* Fix printing size_t data type

* Fix printing size_t data type - set unsigned

* Fix formatting for maxArgs (uint) and numberOfInts (size_t)
---
 test_conformance/api/test_api_min_max.cpp | 1746 +++++++++++++--------
 1 file changed, 1087 insertions(+), 659 deletions(-)

diff --git a/test_conformance/api/test_api_min_max.cpp b/test_conformance/api/test_api_min_max.cpp
index 8d132fe60a..28ca823776 100644
--- a/test_conformance/api/test_api_min_max.cpp
+++ b/test_conformance/api/test_api_min_max.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -24,7 +24,8 @@ const char *sample_single_param_kernel[] = {
     "{\n"
     "    int  tid = get_global_id(0);\n"
     "\n"
-    "}\n" };
+    "}\n"
+};
 
 const char *sample_single_param_write_kernel[] = {
     "__kernel void sample_test(__global int *src)\n"
@@ -32,23 +33,29 @@ const char *sample_single_param_write_kernel[] = {
     "    int  tid = get_global_id(0);\n"
     "     src[tid] = tid;\n"
     "\n"
-    "}\n" };
+    "}\n"
+};
 
 const char *sample_read_image_kernel_pattern[] = {
-    "__kernel void sample_test( __global float *result, ",  " )\n"
+    "__kernel void sample_test( __global float *result, ",
+    " )\n"
     "{\n"
-    "  sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
+    "  sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | "
+    "CLK_FILTER_NEAREST;\n"
     "    int  tid = get_global_id(0);\n"
     "    result[0] = 0.0f;\n",
     "\n"
-    "}\n" };
+    "}\n"
+};
 
 const char *sample_write_image_kernel_pattern[] = {
-    "__kernel void sample_test( ",  " )\n"
+    "__kernel void sample_test( ",
+    " )\n"
     "{\n"
     "    int  tid = get_global_id(0);\n",
     "\n"
-    "}\n" };
+    "}\n"
+};
 
 
 const char *sample_large_parmam_kernel_pattern[] = {
@@ -57,7 +64,8 @@ const char *sample_large_parmam_kernel_pattern[] = {
     "result[0] = 0;\n"
     "%s"
     "\n"
-    "}\n" };
+    "}\n"
+};
 
 const char *sample_large_int_parmam_kernel_pattern[] = {
     "__kernel void sample_test(%s, __global int *result)\n"
@@ -65,15 +73,19 @@ const char *sample_large_int_parmam_kernel_pattern[] = {
     "result[0] = 0;\n"
     "%s"
     "\n"
-    "}\n" };
+    "}\n"
+};
 
 const char *sample_sampler_kernel_pattern[] = {
-    "__kernel void sample_test( read_only image2d_t src, __global int4 *dst", ", sampler_t sampler%d", ")\n"
+    "__kernel void sample_test( read_only image2d_t src, __global int4 *dst",
+    ", sampler_t sampler%d",
+    ")\n"
     "{\n"
     "    int  tid = get_global_id(0);\n",
     "     dst[ 0 ] = read_imagei( src, sampler%d, (int2)( 0, 0 ) );\n",
     "\n"
-    "}\n" };
+    "}\n"
+};
 
 const char *sample_const_arg_kernel[] = {
     "__kernel void sample_test(__constant int *src1, __global int *dst)\n"
@@ -82,10 +94,12 @@ const char *sample_const_arg_kernel[] = {
     "\n"
     "    dst[tid] = src1[tid];\n"
     "\n"
-    "}\n" };
+    "}\n"
+};
 
 const char *sample_local_arg_kernel[] = {
-    "__kernel void sample_test(__local int *src1, __global int *global_src, __global int *dst)\n"
+    "__kernel void sample_test(__local int *src1, __global int *global_src, "
+    "__global int *dst)\n"
     "{\n"
     "    int  tid = get_global_id(0);\n"
     "\n"
@@ -93,19 +107,21 @@ const char *sample_local_arg_kernel[] = {
     "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
     "    dst[tid] = src1[tid];\n"
     "\n"
-    "}\n" };
+    "}\n"
+};
 
 const char *sample_const_max_arg_kernel_pattern =
-"__kernel void sample_test(__constant int *src1 %s, __global int *dst)\n"
-"{\n"
-"    int  tid = get_global_id(0);\n"
-"\n"
-"    dst[tid] = src1[tid];\n"
-"%s"
-"\n"
-"}\n";
-
-int test_min_max_thread_dimensions(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+    "__kernel void sample_test(__constant int *src1 %s, __global int *dst)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "\n"
+    "    dst[tid] = src1[tid];\n"
+    "%s"
+    "\n"
+    "}\n";
+
+int test_min_max_thread_dimensions(cl_device_id deviceID, cl_context context,
+                                   cl_command_queue queue, int num_elements)
 {
     int error, retVal;
     unsigned int maxThreadDim, threadDim, i;
@@ -118,19 +134,24 @@ int test_min_max_thread_dimensions(cl_device_id deviceID, cl_context context, cl
 
 
     /* Get the max thread dimensions */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( maxThreadDim ), &maxThreadDim, NULL );
-    test_error( error, "Unable to get max work item dimensions from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
+                            sizeof(maxThreadDim), &maxThreadDim, NULL);
+    test_error(error, "Unable to get max work item dimensions from device");
 
-    if( maxThreadDim < 3 )
+    if (maxThreadDim < 3)
     {
-        log_error( "ERROR: Reported max work item dimensions is less than required! (%d)\n", maxThreadDim );
+        log_error("ERROR: Reported max work item dimensions is less than "
+                  "required! (%d)\n",
+                  maxThreadDim);
         return -1;
     }
 
     log_info("Reported max thread dimensions of %d.\n", maxThreadDim);
 
     /* Create a kernel to test with */
-    if( create_single_kernel_helper( context, &program, &kernel, 1, sample_single_param_kernel, "sample_test" ) != 0 )
+    if (create_single_kernel_helper(context, &program, &kernel, 1,
+                                    sample_single_param_kernel, "sample_test")
+        != 0)
     {
         return -1;
     }
@@ -138,105 +159,122 @@ int test_min_max_thread_dimensions(cl_device_id deviceID, cl_context context, cl
     /* Create some I/O streams */
     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                 sizeof(cl_int) * 100, NULL, &error);
-    if( streams[0] == NULL )
+    if (streams[0] == NULL)
     {
         log_error("ERROR: Creating test array failed!\n");
         return -1;
     }
 
     /* Set the arguments */
-    error = clSetKernelArg( kernel, 0, sizeof( streams[0] ), &streams[0] );
-    test_error( error, "Unable to set kernel arguments" );
+    error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+    test_error(error, "Unable to set kernel arguments");
 
     retVal = 0;
 
     /* Now try running the kernel with up to that many threads */
-    for (threadDim=1; threadDim <= maxThreadDim; threadDim++)
+    for (threadDim = 1; threadDim <= maxThreadDim; threadDim++)
     {
-        threads = (size_t *)malloc( sizeof( size_t ) * maxThreadDim );
-        localThreads = (size_t *)malloc( sizeof( size_t ) * maxThreadDim );
-        for( i = 0; i < maxThreadDim; i++ )
+        threads = (size_t *)malloc(sizeof(size_t) * maxThreadDim);
+        localThreads = (size_t *)malloc(sizeof(size_t) * maxThreadDim);
+        for (i = 0; i < maxThreadDim; i++)
         {
-            threads[ i ] = 1;
+            threads[i] = 1;
             localThreads[i] = 1;
         }
 
-        error = clEnqueueNDRangeKernel( queue, kernel, maxThreadDim, NULL, threads, localThreads, 0, NULL, &event );
-        test_error( error, "Failed clEnqueueNDRangeKernel");
+        error = clEnqueueNDRangeKernel(queue, kernel, maxThreadDim, NULL,
+                                       threads, localThreads, 0, NULL, &event);
+        test_error(error, "Failed clEnqueueNDRangeKernel");
 
         // Verify that the event does not return an error from the execution
         error = clWaitForEvents(1, &event);
-        test_error( error, "clWaitForEvent failed");
-        error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-        test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+        test_error(error, "clWaitForEvent failed");
+        error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                               sizeof(event_status), &event_status, NULL);
+        test_error(
+            error,
+            "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
         clReleaseEvent(event);
         if (event_status < 0)
             test_error(error, "Kernel execution event returned error");
 
         /* All done */
-        free( threads );
-        free( localThreads );
+        free(threads);
+        free(localThreads);
     }
 
     return retVal;
 }
 
 
-int test_min_max_work_items_sizes(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_work_items_sizes(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
 {
     int error;
     size_t *deviceMaxWorkItemSize;
     unsigned int maxWorkItemDim;
 
     /* Get the max work item dimensions */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( maxWorkItemDim ), &maxWorkItemDim, NULL );
-    test_error( error, "Unable to get max work item dimensions from device" );
-
-    log_info("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS returned %d\n", maxWorkItemDim);
-    deviceMaxWorkItemSize = (size_t*)malloc(sizeof(size_t)*maxWorkItemDim);
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxWorkItemDim, deviceMaxWorkItemSize, NULL );
-    test_error( error, "clDeviceInfo for CL_DEVICE_MAX_WORK_ITEM_SIZES failed" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
+                            sizeof(maxWorkItemDim), &maxWorkItemDim, NULL);
+    test_error(error, "Unable to get max work item dimensions from device");
+
+    log_info("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS returned %d\n",
+             maxWorkItemDim);
+    deviceMaxWorkItemSize = (size_t *)malloc(sizeof(size_t) * maxWorkItemDim);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                            sizeof(size_t) * maxWorkItemDim,
+                            deviceMaxWorkItemSize, NULL);
+    test_error(error, "clDeviceInfo for CL_DEVICE_MAX_WORK_ITEM_SIZES failed");
 
     unsigned int i;
     int errors = 0;
-    for(i=0; i<maxWorkItemDim; i++) {
-        if (deviceMaxWorkItemSize[i]<1) {
-            log_error("MAX_WORK_ITEM_SIZE in dimension %d is invalid: %lu\n", i, deviceMaxWorkItemSize[i]);
+    for (i = 0; i < maxWorkItemDim; i++)
+    {
+        if (deviceMaxWorkItemSize[i] < 1)
+        {
+            log_error("MAX_WORK_ITEM_SIZE in dimension %d is invalid: %lu\n", i,
+                      deviceMaxWorkItemSize[i]);
             errors++;
-        } else {
-            log_info("Dimension %d has max work item size %lu\n", i, deviceMaxWorkItemSize[i]);
+        }
+        else
+        {
+            log_info("Dimension %d has max work item size %lu\n", i,
+                     deviceMaxWorkItemSize[i]);
         }
     }
 
     free(deviceMaxWorkItemSize);
 
-    if (errors)
-        return -1;
+    if (errors) return -1;
     return 0;
 }
 
 
-
-int test_min_max_work_group_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_work_group_size(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements)
 {
     int error;
     size_t deviceMaxThreadSize;
 
     /* Get the max thread dimensions */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof( deviceMaxThreadSize ), &deviceMaxThreadSize, NULL );
-    test_error( error, "Unable to get max work group size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                            sizeof(deviceMaxThreadSize), &deviceMaxThreadSize,
+                            NULL);
+    test_error(error, "Unable to get max work group size from device");
 
     log_info("Reported %ld max device work group size.\n", deviceMaxThreadSize);
 
-    if( deviceMaxThreadSize == 0 )
+    if (deviceMaxThreadSize == 0)
     {
-        log_error( "ERROR: Max work group size is reported as zero!\n" );
+        log_error("ERROR: Max work group size is reported as zero!\n");
         return -1;
     }
     return 0;
 }
 
-int test_min_max_read_image_args(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_read_image_args(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements)
 {
     int error;
     unsigned int maxReadImages, i;
@@ -245,48 +283,55 @@ int test_min_max_read_image_args(cl_device_id deviceID, cl_context context, cl_c
     char readArgLine[128], *programSrc;
     const char *readArgPattern = ", read_only image2d_t srcimg%d";
     clKernelWrapper kernel;
-    clMemWrapper    *streams, result;
+    clMemWrapper *streams, result;
     size_t threads[2];
-    cl_image_format    image_format_desc;
+    cl_image_format image_format_desc;
     size_t maxParameterSize;
     cl_event event;
     cl_int event_status;
-    cl_float image_data[4*4];
+    cl_float image_data[4 * 4];
     float image_result = 0.0f;
     float actual_image_result;
     cl_uint minRequiredReadImages = gIsEmbedded ? 8 : 128;
     cl_device_type deviceType;
 
-    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID )
+    PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID)
     image_format_desc.image_channel_order = CL_RGBA;
     image_format_desc.image_channel_data_type = CL_FLOAT;
 
     /* Get the max read image arg count */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof( maxReadImages ), &maxReadImages, NULL );
-    test_error( error, "Unable to get max read image arg count from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_READ_IMAGE_ARGS,
+                            sizeof(maxReadImages), &maxReadImages, NULL);
+    test_error(error, "Unable to get max read image arg count from device");
 
-    if( maxReadImages < minRequiredReadImages )
+    if (maxReadImages < minRequiredReadImages)
     {
-        log_error( "ERROR: Reported max read image arg count is less than required! (%d)\n", maxReadImages );
+        log_error("ERROR: Reported max read image arg count is less than "
+                  "required! (%d)\n",
+                  maxReadImages);
         return -1;
     }
 
     log_info("Reported %d max read image args.\n", maxReadImages);
 
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_ADDRESS_BITS, sizeof( deviceAddressSize ), &deviceAddressSize, NULL );
-    test_error( error, "Unable to query CL_DEVICE_ADDRESS_BITS for device" );
+    error =
+        clGetDeviceInfo(deviceID, CL_DEVICE_ADDRESS_BITS,
+                        sizeof(deviceAddressSize), &deviceAddressSize, NULL);
+    test_error(error, "Unable to query CL_DEVICE_ADDRESS_BITS for device");
     deviceAddressSize /= 8; // convert from bits to bytes
 
 
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxParameterSize ), &maxParameterSize, NULL );
-    test_error( error, "Unable to get max parameter size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE,
+                            sizeof(maxParameterSize), &maxParameterSize, NULL);
+    test_error(error, "Unable to get max parameter size from device");
 
     if (!gIsEmbedded && maxReadImages >= 128 && maxParameterSize == 1024)
     {
-        error = clGetDeviceInfo( deviceID, CL_DEVICE_TYPE, sizeof( deviceType ), &deviceType, NULL );
-        test_error( error, "Unable to get device type from device" );
+        error = clGetDeviceInfo(deviceID, CL_DEVICE_TYPE, sizeof(deviceType),
+                                &deviceType, NULL);
+        test_error(error, "Unable to get device type from device");
 
-        if(deviceType != CL_DEVICE_TYPE_CUSTOM)
+        if (deviceType != CL_DEVICE_TYPE_CUSTOM)
         {
             maxReadImages = 127;
         }
@@ -295,85 +340,107 @@ int test_min_max_read_image_args(cl_device_id deviceID, cl_context context, cl_c
     maxParameterSize -= deviceAddressSize;
 
     // Calculate the number we can use
-    if (maxParameterSize/deviceAddressSize < maxReadImages) {
-        log_info("WARNING: Max parameter size of %d bytes limits test to %d max image arguments.\n", (int)maxParameterSize, (int)(maxParameterSize/deviceAddressSize));
-        maxReadImages = (unsigned int)(maxParameterSize/deviceAddressSize);
+    if (maxParameterSize / deviceAddressSize < maxReadImages)
+    {
+        log_info("WARNING: Max parameter size of %d bytes limits test to %d "
+                 "max image arguments.\n",
+                 (int)maxParameterSize,
+                 (int)(maxParameterSize / deviceAddressSize));
+        maxReadImages = (unsigned int)(maxParameterSize / deviceAddressSize);
     }
 
     /* Create a program with that many read args */
-    programSrc = (char *)malloc( strlen( sample_read_image_kernel_pattern[ 0 ] ) + ( strlen( readArgPattern ) + 6 ) * ( maxReadImages ) +
-                                strlen( sample_read_image_kernel_pattern[ 1 ] ) + 1 + 40240);
+    programSrc = (char *)malloc(strlen(sample_read_image_kernel_pattern[0])
+                                + (strlen(readArgPattern) + 6) * (maxReadImages)
+                                + strlen(sample_read_image_kernel_pattern[1])
+                                + 1 + 40240);
 
-    strcpy( programSrc, sample_read_image_kernel_pattern[ 0 ] );
-    strcat( programSrc, "read_only image2d_t srcimg0" );
-    for( i = 0; i < maxReadImages-1; i++ )
+    strcpy(programSrc, sample_read_image_kernel_pattern[0]);
+    strcat(programSrc, "read_only image2d_t srcimg0");
+    for (i = 0; i < maxReadImages - 1; i++)
     {
-        sprintf( readArgLine, readArgPattern, i+1 );
-        strcat( programSrc, readArgLine );
+        sprintf(readArgLine, readArgPattern, i + 1);
+        strcat(programSrc, readArgLine);
     }
-    strcat( programSrc, sample_read_image_kernel_pattern[ 1 ] );
-    for ( i = 0; i < maxReadImages; i++) {
-        sprintf( readArgLine, "\tresult[0] += read_imagef( srcimg%d, sampler, (int2)(0,0)).x;\n", i);
-        strcat( programSrc, readArgLine );
+    strcat(programSrc, sample_read_image_kernel_pattern[1]);
+    for (i = 0; i < maxReadImages; i++)
+    {
+        sprintf(
+            readArgLine,
+            "\tresult[0] += read_imagef( srcimg%d, sampler, (int2)(0,0)).x;\n",
+            i);
+        strcat(programSrc, readArgLine);
     }
-    strcat( programSrc, sample_read_image_kernel_pattern[ 2 ] );
+    strcat(programSrc, sample_read_image_kernel_pattern[2]);
 
-    error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&programSrc, "sample_test");
-    test_error( error, "Failed to create the program and kernel.");
-    free( programSrc );
+    error =
+        create_single_kernel_helper(context, &program, &kernel, 1,
+                                    (const char **)&programSrc, "sample_test");
+    test_error(error, "Failed to create the program and kernel.");
+    free(programSrc);
 
     result = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float), NULL,
                             &error);
-    test_error( error, "clCreateBufer failed");
+    test_error(error, "clCreateBufer failed");
 
     /* Create some I/O streams */
     streams = new clMemWrapper[maxReadImages + 1];
-    for( i = 0; i < maxReadImages; i++ )
+    for (i = 0; i < maxReadImages; i++)
     {
-        image_data[0]=i;
-        image_result+= image_data[0];
-        streams[i] = create_image_2d( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &image_format_desc, 4, 4, 0, image_data, &error );
-        test_error( error, "Unable to allocate test image" );
+        image_data[0] = i;
+        image_result += image_data[0];
+        streams[i] =
+            create_image_2d(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                            &image_format_desc, 4, 4, 0, image_data, &error);
+        test_error(error, "Unable to allocate test image");
     }
 
-    error = clSetKernelArg( kernel, 0, sizeof( result ), &result );
-    test_error( error, "Unable to set kernel arguments" );
+    error = clSetKernelArg(kernel, 0, sizeof(result), &result);
+    test_error(error, "Unable to set kernel arguments");
 
     /* Set the arguments */
-    for( i = 1; i < maxReadImages+1; i++ )
+    for (i = 1; i < maxReadImages + 1; i++)
     {
-        error = clSetKernelArg( kernel, i, sizeof( streams[i-1] ), &streams[i-1] );
-        test_error( error, "Unable to set kernel arguments" );
+        error =
+            clSetKernelArg(kernel, i, sizeof(streams[i - 1]), &streams[i - 1]);
+        test_error(error, "Unable to set kernel arguments");
     }
 
     /* Now try running the kernel */
     threads[0] = threads[1] = 1;
-    error = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, threads, NULL, 0, NULL, &event );
-    test_error( error, "clEnqueueNDRangeKernel failed");
+    error = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, threads, NULL, 0,
+                                   NULL, &event);
+    test_error(error, "clEnqueueNDRangeKernel failed");
 
     // Verify that the event does not return an error from the execution
     error = clWaitForEvents(1, &event);
-    test_error( error, "clWaitForEvent failed");
-    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-    test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+    test_error(error, "clWaitForEvent failed");
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(event_status), &event_status, NULL);
+    test_error(error,
+               "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
     clReleaseEvent(event);
     if (event_status < 0)
         test_error(error, "Kernel execution event returned error");
 
-    error = clEnqueueReadBuffer(queue, result, CL_TRUE, 0, sizeof(cl_float), &actual_image_result, 0, NULL, NULL);
+    error = clEnqueueReadBuffer(queue, result, CL_TRUE, 0, sizeof(cl_float),
+                                &actual_image_result, 0, NULL, NULL);
     test_error(error, "clEnqueueReadBuffer failed");
 
     delete[] streams;
 
-    if (actual_image_result != image_result) {
-        log_error("Result failed to verify. Got %g, expected %g.\n", actual_image_result, image_result);
+    if (actual_image_result != image_result)
+    {
+        log_error("Result failed to verify. Got %g, expected %g.\n",
+                  actual_image_result, image_result);
         return 1;
     }
 
     return 0;
 }
 
-int test_min_max_write_image_args(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_write_image_args(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
 {
     int error;
     unsigned int maxWriteImages, i;
@@ -381,94 +448,117 @@ int test_min_max_write_image_args(cl_device_id deviceID, cl_context context, cl_
     char writeArgLine[128], *programSrc;
     const char *writeArgPattern = ", write_only image2d_t dstimg%d";
     clKernelWrapper kernel;
-    clMemWrapper    *streams;
+    clMemWrapper *streams;
     size_t threads[2];
-    cl_image_format    image_format_desc;
+    cl_image_format image_format_desc;
     size_t maxParameterSize;
     cl_event event;
     cl_int event_status;
     cl_uint minRequiredWriteImages = gIsEmbedded ? 1 : 8;
 
 
-    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID )
+    PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID)
     image_format_desc.image_channel_order = CL_RGBA;
     image_format_desc.image_channel_data_type = CL_UNORM_INT8;
 
     /* Get the max read image arg count */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof( maxWriteImages ), &maxWriteImages, NULL );
-    test_error( error, "Unable to get max write image arg count from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WRITE_IMAGE_ARGS,
+                            sizeof(maxWriteImages), &maxWriteImages, NULL);
+    test_error(error, "Unable to get max write image arg count from device");
 
-    if( maxWriteImages == 0 )
+    if (maxWriteImages == 0)
     {
-        log_info( "WARNING: Device reports 0 for a max write image arg count (write image arguments unsupported). Skipping test (implicitly passes). This is only valid if the number of image formats is also 0.\n" );
+        log_info(
+            "WARNING: Device reports 0 for a max write image arg count (write "
+            "image arguments unsupported). Skipping test (implicitly passes). "
+            "This is only valid if the number of image formats is also 0.\n");
         return 0;
     }
 
-    if( maxWriteImages < minRequiredWriteImages )
+    if (maxWriteImages < minRequiredWriteImages)
     {
-        log_error( "ERROR: Reported max write image arg count is less than required! (%d)\n", maxWriteImages );
+        log_error("ERROR: Reported max write image arg count is less than "
+                  "required! (%d)\n",
+                  maxWriteImages);
         return -1;
     }
 
     log_info("Reported %d max write image args.\n", maxWriteImages);
 
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxParameterSize ), &maxParameterSize, NULL );
-    test_error( error, "Unable to get max parameter size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE,
+                            sizeof(maxParameterSize), &maxParameterSize, NULL);
+    test_error(error, "Unable to get max parameter size from device");
 
     // Calculate the number we can use
-    if (maxParameterSize/sizeof(cl_mem) < maxWriteImages) {
-        log_info("WARNING: Max parameter size of %d bytes limits test to %d max image arguments.\n", (int)maxParameterSize, (int)(maxParameterSize/sizeof(cl_mem)));
-        maxWriteImages = (unsigned int)(maxParameterSize/sizeof(cl_mem));
+    if (maxParameterSize / sizeof(cl_mem) < maxWriteImages)
+    {
+        log_info("WARNING: Max parameter size of %d bytes limits test to %d "
+                 "max image arguments.\n",
+                 (int)maxParameterSize,
+                 (int)(maxParameterSize / sizeof(cl_mem)));
+        maxWriteImages = (unsigned int)(maxParameterSize / sizeof(cl_mem));
     }
 
     /* Create a program with that many write args + 1 */
-    programSrc = (char *)malloc( strlen( sample_write_image_kernel_pattern[ 0 ] ) + ( strlen( writeArgPattern ) + 6 ) * ( maxWriteImages + 1 ) +
-                                strlen( sample_write_image_kernel_pattern[ 1 ] ) + 1 + 40240 );
+    programSrc = (char *)malloc(
+        strlen(sample_write_image_kernel_pattern[0])
+        + (strlen(writeArgPattern) + 6) * (maxWriteImages + 1)
+        + strlen(sample_write_image_kernel_pattern[1]) + 1 + 40240);
 
-    strcpy( programSrc, sample_write_image_kernel_pattern[ 0 ] );
-    strcat( programSrc, "write_only image2d_t dstimg0" );
-    for( i = 1; i < maxWriteImages; i++ )
+    strcpy(programSrc, sample_write_image_kernel_pattern[0]);
+    strcat(programSrc, "write_only image2d_t dstimg0");
+    for (i = 1; i < maxWriteImages; i++)
     {
-        sprintf( writeArgLine, writeArgPattern, i );
-        strcat( programSrc, writeArgLine );
+        sprintf(writeArgLine, writeArgPattern, i);
+        strcat(programSrc, writeArgLine);
     }
-    strcat( programSrc, sample_write_image_kernel_pattern[ 1 ] );
-    for ( i = 0; i < maxWriteImages; i++) {
-        sprintf( writeArgLine, "\twrite_imagef( dstimg%d, (int2)(0,0), (float4)(0,0,0,0));\n", i);
-        strcat( programSrc, writeArgLine );
+    strcat(programSrc, sample_write_image_kernel_pattern[1]);
+    for (i = 0; i < maxWriteImages; i++)
+    {
+        sprintf(writeArgLine,
+                "\twrite_imagef( dstimg%d, (int2)(0,0), (float4)(0,0,0,0));\n",
+                i);
+        strcat(programSrc, writeArgLine);
     }
-    strcat( programSrc, sample_write_image_kernel_pattern[ 2 ] );
+    strcat(programSrc, sample_write_image_kernel_pattern[2]);
 
-    error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&programSrc, "sample_test");
-    test_error( error, "Failed to create the program and kernel.");
-    free( programSrc );
+    error =
+        create_single_kernel_helper(context, &program, &kernel, 1,
+                                    (const char **)&programSrc, "sample_test");
+    test_error(error, "Failed to create the program and kernel.");
+    free(programSrc);
 
 
     /* Create some I/O streams */
     streams = new clMemWrapper[maxWriteImages + 1];
-    for( i = 0; i < maxWriteImages; i++ )
+    for (i = 0; i < maxWriteImages; i++)
     {
-        streams[i] = create_image_2d( context, CL_MEM_READ_WRITE, &image_format_desc, 16, 16, 0, NULL, &error );
-        test_error( error, "Unable to allocate test image" );
+        streams[i] =
+            create_image_2d(context, CL_MEM_READ_WRITE, &image_format_desc, 16,
+                            16, 0, NULL, &error);
+        test_error(error, "Unable to allocate test image");
     }
 
     /* Set the arguments */
-    for( i = 0; i < maxWriteImages; i++ )
+    for (i = 0; i < maxWriteImages; i++)
     {
-        error = clSetKernelArg( kernel, i, sizeof( streams[i] ), &streams[i] );
-        test_error( error, "Unable to set kernel arguments" );
+        error = clSetKernelArg(kernel, i, sizeof(streams[i]), &streams[i]);
+        test_error(error, "Unable to set kernel arguments");
     }
 
     /* Now try running the kernel */
     threads[0] = threads[1] = 16;
-    error = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, threads, NULL, 0, NULL, &event );
-    test_error( error, "clEnqueueNDRangeKernel failed.");
+    error = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, threads, NULL, 0,
+                                   NULL, &event);
+    test_error(error, "clEnqueueNDRangeKernel failed.");
 
     // Verify that the event does not return an error from the execution
     error = clWaitForEvents(1, &event);
-    test_error( error, "clWaitForEvent failed");
-    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-    test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+    test_error(error, "clWaitForEvent failed");
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(event_status), &event_status, NULL);
+    test_error(error,
+               "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
     clReleaseEvent(event);
     if (event_status < 0)
         test_error(error, "Kernel execution event returned error");
@@ -478,7 +568,8 @@ int test_min_max_write_image_args(cl_device_id deviceID, cl_context context, cl_
     return 0;
 }
 
-int test_min_max_mem_alloc_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_mem_alloc_size(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
 {
     int error;
     cl_ulong maxAllocSize, memSize, minSizeToTry;
@@ -492,61 +583,89 @@ int test_min_max_mem_alloc_size(cl_device_id deviceID, cl_context context, cl_co
         requiredAllocSize = 128 * 1024 * 1024;
 
     /* Get the max mem alloc size */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get max mem alloc size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get max mem alloc size from device");
 
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( memSize ), &memSize, NULL );
-    test_error( error, "Unable to get global memory size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
+                            sizeof(memSize), &memSize, NULL);
+    test_error(error, "Unable to get global memory size from device");
 
-    if (memSize > (cl_ulong)SIZE_MAX) {
-      memSize = (cl_ulong)SIZE_MAX;
+    if (memSize > (cl_ulong)SIZE_MAX)
+    {
+        memSize = (cl_ulong)SIZE_MAX;
     }
 
-    if( maxAllocSize < requiredAllocSize)
+    if (maxAllocSize < requiredAllocSize)
     {
-        log_error( "ERROR: Reported max allocation size is less than required %lldMB! (%llu or %lluMB, from a total mem size of %lldMB)\n", (requiredAllocSize / 1024) / 1024, maxAllocSize, (maxAllocSize / 1024)/1024, (memSize / 1024)/1024 );
+        log_error("ERROR: Reported max allocation size is less than required "
+                  "%lldMB! (%llu or %lluMB, from a total mem size of %lldMB)\n",
+                  (requiredAllocSize / 1024) / 1024, maxAllocSize,
+                  (maxAllocSize / 1024) / 1024, (memSize / 1024) / 1024);
         return -1;
     }
 
-    requiredAllocSize = ((memSize / 4) > (1024 * 1024 * 1024)) ? 1024 * 1024 * 1024 : memSize / 4;
+    requiredAllocSize = ((memSize / 4) > (1024 * 1024 * 1024))
+        ? 1024 * 1024 * 1024
+        : memSize / 4;
 
     if (gIsEmbedded)
-        requiredAllocSize = (requiredAllocSize < 1 * 1024 * 1024) ? 1 * 1024 * 1024 : requiredAllocSize;
+        requiredAllocSize = (requiredAllocSize < 1 * 1024 * 1024)
+            ? 1 * 1024 * 1024
+            : requiredAllocSize;
     else
-    requiredAllocSize = (requiredAllocSize < 128 * 1024 * 1024) ? 128 * 1024 * 1024 : requiredAllocSize;
+        requiredAllocSize = (requiredAllocSize < 128 * 1024 * 1024)
+            ? 128 * 1024 * 1024
+            : requiredAllocSize;
 
-    if( maxAllocSize < requiredAllocSize )
+    if (maxAllocSize < requiredAllocSize)
     {
-        log_error( "ERROR: Reported max allocation size is less than required of total memory! (%llu or %lluMB, from a total mem size of %lluMB)\n", maxAllocSize, (maxAllocSize / 1024)/1024, (requiredAllocSize / 1024)/1024 );
+        log_error(
+            "ERROR: Reported max allocation size is less than required of "
+            "total memory! (%llu or %lluMB, from a total mem size of %lluMB)\n",
+            maxAllocSize, (maxAllocSize / 1024) / 1024,
+            (requiredAllocSize / 1024) / 1024);
         return -1;
     }
 
-    log_info("Reported max allocation size of %lld bytes (%gMB) and global mem size of %lld bytes (%gMB).\n",
-             maxAllocSize, maxAllocSize/(1024.0*1024.0), requiredAllocSize, requiredAllocSize/(1024.0*1024.0));
+    log_info("Reported max allocation size of %lld bytes (%gMB) and global mem "
+             "size of %lld bytes (%gMB).\n",
+             maxAllocSize, maxAllocSize / (1024.0 * 1024.0), requiredAllocSize,
+             requiredAllocSize / (1024.0 * 1024.0));
 
-    if ( memSize < maxAllocSize ) {
-        log_info("Global memory size is less than max allocation size, using that.\n");
+    if (memSize < maxAllocSize)
+    {
+        log_info("Global memory size is less than max allocation size, using "
+                 "that.\n");
         maxAllocSize = memSize;
     }
 
-    minSizeToTry = maxAllocSize/16;
-    while (maxAllocSize > (maxAllocSize/4)) {
+    minSizeToTry = maxAllocSize / 16;
+    while (maxAllocSize > (maxAllocSize / 4))
+    {
 
-        log_info("Trying to create a buffer of size of %lld bytes (%gMB).\n", maxAllocSize, (double)maxAllocSize/(1024.0*1024.0));
-        memHdl = clCreateBuffer( context, CL_MEM_READ_ONLY, (size_t)maxAllocSize, NULL, &error );
-        if (error == CL_MEM_OBJECT_ALLOCATION_FAILURE || error == CL_OUT_OF_RESOURCES || error == CL_OUT_OF_HOST_MEMORY) {
-            log_info("\tAllocation failed at size of %lld bytes (%gMB).\n", maxAllocSize, (double)maxAllocSize/(1024.0*1024.0));
+        log_info("Trying to create a buffer of size of %lld bytes (%gMB).\n",
+                 maxAllocSize, (double)maxAllocSize / (1024.0 * 1024.0));
+        memHdl = clCreateBuffer(context, CL_MEM_READ_ONLY, (size_t)maxAllocSize,
+                                NULL, &error);
+        if (error == CL_MEM_OBJECT_ALLOCATION_FAILURE
+            || error == CL_OUT_OF_RESOURCES || error == CL_OUT_OF_HOST_MEMORY)
+        {
+            log_info("\tAllocation failed at size of %lld bytes (%gMB).\n",
+                     maxAllocSize, (double)maxAllocSize / (1024.0 * 1024.0));
             maxAllocSize -= minSizeToTry;
             continue;
         }
-        test_error( error, "clCreateBuffer failed for maximum sized buffer.");
+        test_error(error, "clCreateBuffer failed for maximum sized buffer.");
         return 0;
     }
-    log_error("Failed to allocate even %lld bytes (%gMB).\n", maxAllocSize, (double)maxAllocSize/(1024.0*1024.0));
+    log_error("Failed to allocate even %lld bytes (%gMB).\n", maxAllocSize,
+              (double)maxAllocSize / (1024.0 * 1024.0));
     return -1;
 }
 
-int test_min_max_image_2d_width(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_image_2d_width(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
 {
     int error;
     size_t maxDimension;
@@ -557,7 +676,7 @@ int test_min_max_image_2d_width(cl_device_id deviceID, cl_context context, cl_co
     size_t length;
 
 
-    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID )
+    PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID)
 
     auto version = get_device_cl_version(deviceID);
     if (version == Version(1, 0))
@@ -571,16 +690,20 @@ int test_min_max_image_2d_width(cl_device_id deviceID, cl_context context, cl_co
 
 
     /* Just get any ol format to test with */
-    error = get_8_bit_image_format( context, CL_MEM_OBJECT_IMAGE2D, CL_MEM_READ_WRITE, 0, &image_format_desc );
-    test_error( error, "Unable to obtain suitable image format to test with!" );
+    error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE2D,
+                                   CL_MEM_READ_WRITE, 0, &image_format_desc);
+    test_error(error, "Unable to obtain suitable image format to test with!");
 
     /* Get the max 2d image width */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof( maxDimension ), &maxDimension, NULL );
-    test_error( error, "Unable to get max image 2d width from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE2D_MAX_WIDTH,
+                            sizeof(maxDimension), &maxDimension, NULL);
+    test_error(error, "Unable to get max image 2d width from device");
 
-    if( maxDimension < minRequiredDimension )
+    if (maxDimension < minRequiredDimension)
     {
-        log_error( "ERROR: Reported max image 2d width is less than required! (%d)\n", (int)maxDimension );
+        log_error(
+            "ERROR: Reported max image 2d width is less than required! (%d)\n",
+            (int)maxDimension);
         return -1;
     }
     log_info("Max reported width is %ld.\n", maxDimension);
@@ -588,34 +711,42 @@ int test_min_max_image_2d_width(cl_device_id deviceID, cl_context context, cl_co
     /* Verify we can use the format */
     image_format_desc.image_channel_data_type = CL_UNORM_INT8;
     image_format_desc.image_channel_order = CL_RGBA;
-    if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &image_format_desc)) {
+    if (!is_image_format_supported(context, CL_MEM_READ_ONLY,
+                                   CL_MEM_OBJECT_IMAGE2D, &image_format_desc))
+    {
         log_error("CL_UNORM_INT8 CL_RGBA not supported. Can not test.");
         return -1;
     }
 
     /* Verify that we can actually allocate an image that large */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." );
-    if ( (cl_ulong)maxDimension*1*4 > maxAllocSize ) {
-        log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n",
-                  (cl_ulong)maxDimension*1*4, maxAllocSize);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE.");
+    if ((cl_ulong)maxDimension * 1 * 4 > maxAllocSize)
+    {
+        log_error("Can not allocate a large enough image (min size: %lld "
+                  "bytes, max allowed: %lld bytes) to test.\n",
+                  (cl_ulong)maxDimension * 1 * 4, maxAllocSize);
         return -1;
     }
 
-    log_info("Attempting to create an image of size %d x 1 = %gMB.\n", (int)maxDimension, ((float)maxDimension*4/1024.0/1024.0));
+    log_info("Attempting to create an image of size %d x 1 = %gMB.\n",
+             (int)maxDimension, ((float)maxDimension * 4 / 1024.0 / 1024.0));
 
     /* Try to allocate a very big image */
-    streams[0] = create_image_2d( context, CL_MEM_READ_ONLY, &image_format_desc, maxDimension, 1, 0, NULL, &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[0] = create_image_2d(context, CL_MEM_READ_ONLY, &image_format_desc,
+                                 maxDimension, 1, 0, NULL, &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
     {
-        print_error( error, "Image 2D creation failed for maximum width" );
+        print_error(error, "Image 2D creation failed for maximum width");
         return -1;
     }
 
     return 0;
 }
 
-int test_min_max_image_2d_height(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_image_2d_height(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements)
 {
     int error;
     size_t maxDimension;
@@ -625,7 +756,7 @@ int test_min_max_image_2d_height(cl_device_id deviceID, cl_context context, cl_c
     cl_uint minRequiredDimension;
     size_t length;
 
-    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID )
+    PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID)
 
     auto version = get_device_cl_version(deviceID);
     if (version == Version(1, 0))
@@ -638,16 +769,20 @@ int test_min_max_image_2d_height(cl_device_id deviceID, cl_context context, cl_c
     }
 
     /* Just get any ol format to test with */
-    error = get_8_bit_image_format( context, CL_MEM_OBJECT_IMAGE2D, CL_MEM_READ_WRITE, 0, &image_format_desc );
-    test_error( error, "Unable to obtain suitable image format to test with!" );
+    error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE2D,
+                                   CL_MEM_READ_WRITE, 0, &image_format_desc);
+    test_error(error, "Unable to obtain suitable image format to test with!");
 
     /* Get the max 2d image width */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof( maxDimension ), &maxDimension, NULL );
-    test_error( error, "Unable to get max image 2d height from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE2D_MAX_HEIGHT,
+                            sizeof(maxDimension), &maxDimension, NULL);
+    test_error(error, "Unable to get max image 2d height from device");
 
-    if( maxDimension < minRequiredDimension )
+    if (maxDimension < minRequiredDimension)
     {
-        log_error( "ERROR: Reported max image 2d height is less than required! (%d)\n", (int)maxDimension );
+        log_error(
+            "ERROR: Reported max image 2d height is less than required! (%d)\n",
+            (int)maxDimension);
         return -1;
     }
     log_info("Max reported height is %ld.\n", maxDimension);
@@ -655,56 +790,67 @@ int test_min_max_image_2d_height(cl_device_id deviceID, cl_context context, cl_c
     /* Verify we can use the format */
     image_format_desc.image_channel_data_type = CL_UNORM_INT8;
     image_format_desc.image_channel_order = CL_RGBA;
-    if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &image_format_desc)) {
+    if (!is_image_format_supported(context, CL_MEM_READ_ONLY,
+                                   CL_MEM_OBJECT_IMAGE2D, &image_format_desc))
+    {
         log_error("CL_UNORM_INT8 CL_RGBA not supported. Can not test.");
         return -1;
     }
 
     /* Verify that we can actually allocate an image that large */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." );
-    if ( (cl_ulong)maxDimension*1*4 > maxAllocSize ) {
-        log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n",
-                  (cl_ulong)maxDimension*1*4, maxAllocSize);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE.");
+    if ((cl_ulong)maxDimension * 1 * 4 > maxAllocSize)
+    {
+        log_error("Can not allocate a large enough image (min size: %lld "
+                  "bytes, max allowed: %lld bytes) to test.\n",
+                  (cl_ulong)maxDimension * 1 * 4, maxAllocSize);
         return -1;
     }
 
-    log_info("Attempting to create an image of size 1 x %d = %gMB.\n", (int)maxDimension, ((float)maxDimension*4/1024.0/1024.0));
+    log_info("Attempting to create an image of size 1 x %d = %gMB.\n",
+             (int)maxDimension, ((float)maxDimension * 4 / 1024.0 / 1024.0));
 
     /* Try to allocate a very big image */
-    streams[0] = create_image_2d( context, CL_MEM_READ_ONLY, &image_format_desc, 1, maxDimension, 0, NULL, &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[0] = create_image_2d(context, CL_MEM_READ_ONLY, &image_format_desc,
+                                 1, maxDimension, 0, NULL, &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
     {
-        print_error( error, "Image 2D creation failed for maximum height" );
+        print_error(error, "Image 2D creation failed for maximum height");
         return -1;
     }
 
     return 0;
 }
 
-int test_min_max_image_3d_width(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_image_3d_width(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
 {
     int error;
     size_t maxDimension;
     clMemWrapper streams[1];
-    cl_image_format    image_format_desc;
+    cl_image_format image_format_desc;
     cl_ulong maxAllocSize;
 
 
-    PASSIVE_REQUIRE_3D_IMAGE_SUPPORT( deviceID )
+    PASSIVE_REQUIRE_3D_IMAGE_SUPPORT(deviceID)
 
     /* Just get any ol format to test with */
     error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE3D,
                                    CL_MEM_READ_ONLY, 0, &image_format_desc);
-    test_error( error, "Unable to obtain suitable image format to test with!" );
+    test_error(error, "Unable to obtain suitable image format to test with!");
 
     /* Get the max 2d image width */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof( maxDimension ), &maxDimension, NULL );
-    test_error( error, "Unable to get max image 3d width from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE3D_MAX_WIDTH,
+                            sizeof(maxDimension), &maxDimension, NULL);
+    test_error(error, "Unable to get max image 3d width from device");
 
-    if( maxDimension < 2048 )
+    if (maxDimension < 2048)
     {
-        log_error( "ERROR: Reported max image 3d width is less than required! (%d)\n", (int)maxDimension );
+        log_error(
+            "ERROR: Reported max image 3d width is less than required! (%d)\n",
+            (int)maxDimension);
         return -1;
     }
     log_info("Max reported width is %ld.\n", maxDimension);
@@ -712,56 +858,68 @@ int test_min_max_image_3d_width(cl_device_id deviceID, cl_context context, cl_co
     /* Verify we can use the format */
     image_format_desc.image_channel_data_type = CL_UNORM_INT8;
     image_format_desc.image_channel_order = CL_RGBA;
-    if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, &image_format_desc)) {
+    if (!is_image_format_supported(context, CL_MEM_READ_ONLY,
+                                   CL_MEM_OBJECT_IMAGE3D, &image_format_desc))
+    {
         log_error("CL_UNORM_INT8 CL_RGBA not supported. Can not test.");
         return -1;
     }
 
     /* Verify that we can actually allocate an image that large */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." );
-    if ( (cl_ulong)maxDimension*2*4 > maxAllocSize ) {
-        log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n",
-                  (cl_ulong)maxDimension*2*4, maxAllocSize);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE.");
+    if ((cl_ulong)maxDimension * 2 * 4 > maxAllocSize)
+    {
+        log_error("Can not allocate a large enough image (min size: %lld "
+                  "bytes, max allowed: %lld bytes) to test.\n",
+                  (cl_ulong)maxDimension * 2 * 4, maxAllocSize);
         return -1;
     }
 
-    log_info("Attempting to create an image of size %d x 1 x 2 = %gMB.\n", (int)maxDimension, (2*(float)maxDimension*4/1024.0/1024.0));
+    log_info("Attempting to create an image of size %d x 1 x 2 = %gMB.\n",
+             (int)maxDimension,
+             (2 * (float)maxDimension * 4 / 1024.0 / 1024.0));
 
     /* Try to allocate a very big image */
-    streams[0] = create_image_3d( context, CL_MEM_READ_ONLY, &image_format_desc, maxDimension, 1, 2, 0, 0, NULL, &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[0] = create_image_3d(context, CL_MEM_READ_ONLY, &image_format_desc,
+                                 maxDimension, 1, 2, 0, 0, NULL, &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
     {
-        print_error( error, "Image 3D creation failed for maximum width" );
+        print_error(error, "Image 3D creation failed for maximum width");
         return -1;
     }
 
     return 0;
 }
 
-int test_min_max_image_3d_height(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_image_3d_height(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements)
 {
     int error;
     size_t maxDimension;
     clMemWrapper streams[1];
-    cl_image_format    image_format_desc;
+    cl_image_format image_format_desc;
     cl_ulong maxAllocSize;
 
 
-    PASSIVE_REQUIRE_3D_IMAGE_SUPPORT( deviceID )
+    PASSIVE_REQUIRE_3D_IMAGE_SUPPORT(deviceID)
 
     /* Just get any ol format to test with */
     error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE3D,
                                    CL_MEM_READ_ONLY, 0, &image_format_desc);
-    test_error( error, "Unable to obtain suitable image format to test with!" );
+    test_error(error, "Unable to obtain suitable image format to test with!");
 
     /* Get the max 2d image width */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof( maxDimension ), &maxDimension, NULL );
-    test_error( error, "Unable to get max image 3d height from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE3D_MAX_HEIGHT,
+                            sizeof(maxDimension), &maxDimension, NULL);
+    test_error(error, "Unable to get max image 3d height from device");
 
-    if( maxDimension < 2048 )
+    if (maxDimension < 2048)
     {
-        log_error( "ERROR: Reported max image 3d height is less than required! (%d)\n", (int)maxDimension );
+        log_error(
+            "ERROR: Reported max image 3d height is less than required! (%d)\n",
+            (int)maxDimension);
         return -1;
     }
     log_info("Max reported height is %ld.\n", maxDimension);
@@ -769,27 +927,35 @@ int test_min_max_image_3d_height(cl_device_id deviceID, cl_context context, cl_c
     /* Verify we can use the format */
     image_format_desc.image_channel_data_type = CL_UNORM_INT8;
     image_format_desc.image_channel_order = CL_RGBA;
-    if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, &image_format_desc)) {
+    if (!is_image_format_supported(context, CL_MEM_READ_ONLY,
+                                   CL_MEM_OBJECT_IMAGE3D, &image_format_desc))
+    {
         log_error("CL_UNORM_INT8 CL_RGBA not supported. Can not test.");
         return -1;
     }
 
     /* Verify that we can actually allocate an image that large */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." );
-    if ( (cl_ulong)maxDimension*2*4 > maxAllocSize ) {
-        log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n",
-                  (cl_ulong)maxDimension*2*4, maxAllocSize);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE.");
+    if ((cl_ulong)maxDimension * 2 * 4 > maxAllocSize)
+    {
+        log_error("Can not allocate a large enough image (min size: %lld "
+                  "bytes, max allowed: %lld bytes) to test.\n",
+                  (cl_ulong)maxDimension * 2 * 4, maxAllocSize);
         return -1;
     }
 
-    log_info("Attempting to create an image of size 1 x %d x 2 = %gMB.\n", (int)maxDimension, (2*(float)maxDimension*4/1024.0/1024.0));
+    log_info("Attempting to create an image of size 1 x %d x 2 = %gMB.\n",
+             (int)maxDimension,
+             (2 * (float)maxDimension * 4 / 1024.0 / 1024.0));
 
     /* Try to allocate a very big image */
-    streams[0] = create_image_3d( context, CL_MEM_READ_ONLY, &image_format_desc, 1, maxDimension, 2, 0, 0, NULL, &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[0] = create_image_3d(context, CL_MEM_READ_ONLY, &image_format_desc,
+                                 1, maxDimension, 2, 0, 0, NULL, &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
     {
-        print_error( error, "Image 3D creation failed for maximum height" );
+        print_error(error, "Image 3D creation failed for maximum height");
         return -1;
     }
 
@@ -797,29 +963,33 @@ int test_min_max_image_3d_height(cl_device_id deviceID, cl_context context, cl_c
 }
 
 
-int test_min_max_image_3d_depth(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_image_3d_depth(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
 {
     int error;
     size_t maxDimension;
     clMemWrapper streams[1];
-    cl_image_format    image_format_desc;
+    cl_image_format image_format_desc;
     cl_ulong maxAllocSize;
 
 
-    PASSIVE_REQUIRE_3D_IMAGE_SUPPORT( deviceID )
+    PASSIVE_REQUIRE_3D_IMAGE_SUPPORT(deviceID)
 
     /* Just get any ol format to test with */
     error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE3D,
                                    CL_MEM_READ_ONLY, 0, &image_format_desc);
-    test_error( error, "Unable to obtain suitable image format to test with!" );
+    test_error(error, "Unable to obtain suitable image format to test with!");
 
     /* Get the max 2d image width */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof( maxDimension ), &maxDimension, NULL );
-    test_error( error, "Unable to get max image 3d depth from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE3D_MAX_DEPTH,
+                            sizeof(maxDimension), &maxDimension, NULL);
+    test_error(error, "Unable to get max image 3d depth from device");
 
-    if( maxDimension < 2048 )
+    if (maxDimension < 2048)
     {
-        log_error( "ERROR: Reported max image 3d depth is less than required! (%d)\n", (int)maxDimension );
+        log_error(
+            "ERROR: Reported max image 3d depth is less than required! (%d)\n",
+            (int)maxDimension);
         return -1;
     }
     log_info("Max reported depth is %ld.\n", maxDimension);
@@ -827,55 +997,67 @@ int test_min_max_image_3d_depth(cl_device_id deviceID, cl_context context, cl_co
     /* Verify we can use the format */
     image_format_desc.image_channel_data_type = CL_UNORM_INT8;
     image_format_desc.image_channel_order = CL_RGBA;
-    if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, &image_format_desc)) {
+    if (!is_image_format_supported(context, CL_MEM_READ_ONLY,
+                                   CL_MEM_OBJECT_IMAGE3D, &image_format_desc))
+    {
         log_error("CL_UNORM_INT8 CL_RGBA not supported. Can not test.");
         return -1;
     }
 
     /* Verify that we can actually allocate an image that large */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." );
-    if ( (cl_ulong)maxDimension*1*4 > maxAllocSize ) {
-        log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n",
-                  (cl_ulong)maxDimension*1*4, maxAllocSize);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE.");
+    if ((cl_ulong)maxDimension * 1 * 4 > maxAllocSize)
+    {
+        log_error("Can not allocate a large enough image (min size: %lld "
+                  "bytes, max allowed: %lld bytes) to test.\n",
+                  (cl_ulong)maxDimension * 1 * 4, maxAllocSize);
         return -1;
     }
 
-    log_info("Attempting to create an image of size 1 x 1 x %d = %gMB.\n", (int)maxDimension, ((float)maxDimension*4/1024.0/1024.0));
+    log_info("Attempting to create an image of size 1 x 1 x %d = %gMB.\n",
+             (int)maxDimension, ((float)maxDimension * 4 / 1024.0 / 1024.0));
 
     /* Try to allocate a very big image */
-    streams[0] = create_image_3d( context, CL_MEM_READ_ONLY, &image_format_desc, 1, 1, maxDimension, 0, 0, NULL, &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[0] = create_image_3d(context, CL_MEM_READ_ONLY, &image_format_desc,
+                                 1, 1, maxDimension, 0, 0, NULL, &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
     {
-        print_error( error, "Image 3D creation failed for maximum depth" );
+        print_error(error, "Image 3D creation failed for maximum depth");
         return -1;
     }
 
     return 0;
 }
 
-int test_min_max_image_array_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_image_array_size(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
 {
     int error;
     size_t maxDimension;
     clMemWrapper streams[1];
-    cl_image_format    image_format_desc;
+    cl_image_format image_format_desc;
     cl_ulong maxAllocSize;
     size_t minRequiredDimension = gIsEmbedded ? 256 : 2048;
 
-    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID );
+    PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID);
 
     /* Just get any ol format to test with */
-    error = get_8_bit_image_format( context, CL_MEM_OBJECT_IMAGE2D_ARRAY, CL_MEM_READ_WRITE, 0, &image_format_desc );
-    test_error( error, "Unable to obtain suitable image format to test with!" );
+    error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE2D_ARRAY,
+                                   CL_MEM_READ_WRITE, 0, &image_format_desc);
+    test_error(error, "Unable to obtain suitable image format to test with!");
 
     /* Get the max image array width */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, sizeof( maxDimension ), &maxDimension, NULL );
-    test_error( error, "Unable to get max image array size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE,
+                            sizeof(maxDimension), &maxDimension, NULL);
+    test_error(error, "Unable to get max image array size from device");
 
-    if( maxDimension < minRequiredDimension )
+    if (maxDimension < minRequiredDimension)
     {
-        log_error( "ERROR: Reported max image array size is less than required! (%d)\n", (int)maxDimension );
+        log_error("ERROR: Reported max image array size is less than required! "
+                  "(%d)\n",
+                  (int)maxDimension);
         return -1;
     }
     log_info("Max reported image array size is %ld.\n", maxDimension);
@@ -883,96 +1065,127 @@ int test_min_max_image_array_size(cl_device_id deviceID, cl_context context, cl_
     /* Verify we can use the format */
     image_format_desc.image_channel_data_type = CL_UNORM_INT8;
     image_format_desc.image_channel_order = CL_RGBA;
-    if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D_ARRAY, &image_format_desc)) {
+    if (!is_image_format_supported(context, CL_MEM_READ_ONLY,
+                                   CL_MEM_OBJECT_IMAGE2D_ARRAY,
+                                   &image_format_desc))
+    {
         log_error("CL_UNORM_INT8 CL_RGBA not supported. Can not test.");
         return -1;
     }
 
     /* Verify that we can actually allocate an image that large */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." );
-    if ( (cl_ulong)maxDimension*1*4 > maxAllocSize ) {
-        log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n",
-                  (cl_ulong)maxDimension*1*4, maxAllocSize);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE.");
+    if ((cl_ulong)maxDimension * 1 * 4 > maxAllocSize)
+    {
+        log_error("Can not allocate a large enough image (min size: %lld "
+                  "bytes, max allowed: %lld bytes) to test.\n",
+                  (cl_ulong)maxDimension * 1 * 4, maxAllocSize);
         return -1;
     }
 
-    log_info("Attempting to create an image of size 1 x 1 x %d = %gMB.\n", (int)maxDimension, ((float)maxDimension*4/1024.0/1024.0));
+    log_info("Attempting to create an image of size 1 x 1 x %d = %gMB.\n",
+             (int)maxDimension, ((float)maxDimension * 4 / 1024.0 / 1024.0));
 
     /* Try to allocate a very big image */
-    streams[0] = create_image_2d_array( context, CL_MEM_READ_ONLY, &image_format_desc, 1, 1, maxDimension, 0, 0, NULL, &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[0] =
+        create_image_2d_array(context, CL_MEM_READ_ONLY, &image_format_desc, 1,
+                              1, maxDimension, 0, 0, NULL, &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
     {
-        print_error( error, "2D Image Array creation failed for maximum array size" );
+        print_error(error,
+                    "2D Image Array creation failed for maximum array size");
         return -1;
     }
 
     return 0;
 }
 
-int test_min_max_image_buffer_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_image_buffer_size(cl_device_id deviceID, cl_context context,
+                                   cl_command_queue queue, int num_elements)
 {
     int error;
     size_t maxDimensionPixels;
     clMemWrapper streams[2];
-    cl_image_format image_format_desc = {0};
+    cl_image_format image_format_desc = { 0 };
     cl_ulong maxAllocSize;
     size_t minRequiredDimension = gIsEmbedded ? 2048 : 65536;
     unsigned int i = 0;
     size_t pixelBytes = 0;
 
-    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID );
+    PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID);
 
     /* Get the max memory allocation size */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL );
-    test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, NULL);
+    test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE.");
 
     /* Get the max image array width */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof( maxDimensionPixels ), &maxDimensionPixels, NULL );
-    test_error( error, "Unable to get max image buffer size from device" );
+    error =
+        clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE,
+                        sizeof(maxDimensionPixels), &maxDimensionPixels, NULL);
+    test_error(error, "Unable to get max image buffer size from device");
 
-    if( maxDimensionPixels < minRequiredDimension )
+    if (maxDimensionPixels < minRequiredDimension)
     {
-        log_error( "ERROR: Reported max image buffer size is less than required! (%d)\n", (int)maxDimensionPixels );
+        log_error("ERROR: Reported max image buffer size is less than "
+                  "required! (%d)\n",
+                  (int)maxDimensionPixels);
         return -1;
     }
-    log_info("Max reported image buffer size is %ld pixels.\n", maxDimensionPixels);
+    log_info("Max reported image buffer size is %ld pixels.\n",
+             maxDimensionPixels);
 
     pixelBytes = maxAllocSize / maxDimensionPixels;
-    if ( pixelBytes == 0 )
+    if (pixelBytes == 0)
     {
-        log_error( "Value of CL_DEVICE_IMAGE_MAX_BUFFER_SIZE is greater than CL_MAX_MEM_ALLOC_SIZE so there is no way to allocate image of maximum size!\n" );
+        log_error("Value of CL_DEVICE_IMAGE_MAX_BUFFER_SIZE is greater than "
+                  "CL_MAX_MEM_ALLOC_SIZE so there is no way to allocate image "
+                  "of maximum size!\n");
         return -1;
     }
 
     error = -1;
-    for ( i = pixelBytes; i > 0; --i )
+    for (i = pixelBytes; i > 0; --i)
     {
-        error = get_8_bit_image_format( context, CL_MEM_OBJECT_IMAGE1D, CL_MEM_READ_ONLY, i, &image_format_desc );
-        if ( error == CL_SUCCESS )
+        error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE1D,
+                                       CL_MEM_READ_ONLY, i, &image_format_desc);
+        if (error == CL_SUCCESS)
         {
             pixelBytes = i;
             break;
         }
     }
-    test_error( error, "Device does not support format to be used to allocate image of CL_DEVICE_IMAGE_MAX_BUFFER_SIZE\n" );
+    test_error(error,
+               "Device does not support format to be used to allocate image of "
+               "CL_DEVICE_IMAGE_MAX_BUFFER_SIZE\n");
 
-    log_info("Attempting to create an 1D image with channel order %s from buffer of size %d = %gMB.\n",
-        GetChannelOrderName( image_format_desc.image_channel_order ), (int)maxDimensionPixels, ((float)maxDimensionPixels*pixelBytes/1024.0/1024.0));
+    log_info("Attempting to create an 1D image with channel order %s from "
+             "buffer of size %d = %gMB.\n",
+             GetChannelOrderName(image_format_desc.image_channel_order),
+             (int)maxDimensionPixels,
+             ((float)maxDimensionPixels * pixelBytes / 1024.0 / 1024.0));
 
     /* Try to allocate a buffer */
-    streams[0] = clCreateBuffer( context, CL_MEM_READ_ONLY, maxDimensionPixels*pixelBytes, NULL, &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
+    streams[0] = clCreateBuffer(context, CL_MEM_READ_ONLY,
+                                maxDimensionPixels * pixelBytes, NULL, &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
     {
-        print_error( error, "Buffer creation failed for maximum image buffer size" );
+        print_error(error,
+                    "Buffer creation failed for maximum image buffer size");
         return -1;
     }
 
     /* Try to allocate a 1D image array from buffer */
-    streams[1] = create_image_1d( context, CL_MEM_READ_ONLY, &image_format_desc, maxDimensionPixels, 0, NULL, streams[0], &error );
-    if( ( streams[0] == NULL ) || ( error != CL_SUCCESS ))
-    {
-        print_error( error, "1D Image from buffer creation failed for maximum image buffer size" );
+    streams[1] =
+        create_image_1d(context, CL_MEM_READ_ONLY, &image_format_desc,
+                        maxDimensionPixels, 0, NULL, streams[0], &error);
+    if ((streams[0] == NULL) || (error != CL_SUCCESS))
+    {
+        print_error(error,
+                    "1D Image from buffer creation failed for maximum image "
+                    "buffer size");
         return -1;
     }
 
@@ -980,8 +1193,8 @@ int test_min_max_image_buffer_size(cl_device_id deviceID, cl_context context, cl
 }
 
 
-
-int test_min_max_parameter_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_parameter_size(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
 {
     int error, retVal, i;
     size_t maxSize;
@@ -1000,62 +1213,78 @@ int test_min_max_parameter_size(cl_device_id deviceID, cl_context context, cl_co
 
 
     /* Get the max param size */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxSize ), &maxSize, NULL );
-    test_error( error, "Unable to get max parameter size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE,
+                            sizeof(maxSize), &maxSize, NULL);
+    test_error(error, "Unable to get max parameter size from device");
 
 
-    if( ((!gIsEmbedded) && (maxSize < 1024)) || ((gIsEmbedded) && (maxSize < 256)) )
+    if (((!gIsEmbedded) && (maxSize < 1024))
+        || ((gIsEmbedded) && (maxSize < 256)))
     {
-        log_error( "ERROR: Reported max parameter size is less than required! (%d)\n", (int)maxSize );
+        log_error(
+            "ERROR: Reported max parameter size is less than required! (%d)\n",
+            (int)maxSize);
         return -1;
     }
 
     /* The embedded profile without cles_khr_int64 extension does not require
      * longs, so use ints */
     if (embeddedNoLong)
-        numberOfIntParametersToTry = numberExpected = (maxSize-sizeof(cl_mem))/sizeof(cl_int);
+        numberOfIntParametersToTry = numberExpected =
+            (maxSize - sizeof(cl_mem)) / sizeof(cl_int);
     else
-        numberOfIntParametersToTry = numberExpected = (maxSize-sizeof(cl_mem))/sizeof(cl_long);
+        numberOfIntParametersToTry = numberExpected =
+            (maxSize - sizeof(cl_mem)) / sizeof(cl_long);
 
-    decrement = (size_t)(numberOfIntParametersToTry/8);
-    if (decrement < 1)
-        decrement = 1;
+    decrement = (size_t)(numberOfIntParametersToTry / 8);
+    if (decrement < 1) decrement = 1;
     log_info("Reported max parameter size of %d bytes.\n", (int)maxSize);
 
-    while (numberOfIntParametersToTry > 0) {
-        // These need to be inside to be deallocated automatically on each loop iteration.
+    while (numberOfIntParametersToTry > 0)
+    {
+        // These need to be inside to be deallocated automatically on each loop
+        // iteration.
         clProgramWrapper program;
         clMemWrapper mem;
         clKernelWrapper kernel;
 
         if (embeddedNoLong)
         {
-            log_info("Trying a kernel with %ld int arguments (%ld bytes) and one cl_mem (%ld bytes) for %ld bytes total.\n",
-                     numberOfIntParametersToTry, sizeof(cl_int)*numberOfIntParametersToTry, sizeof(cl_mem),
-                     sizeof(cl_mem)+numberOfIntParametersToTry*sizeof(cl_int));
+            log_info(
+                "Trying a kernel with %ld int arguments (%ld bytes) and one "
+                "cl_mem (%ld bytes) for %ld bytes total.\n",
+                numberOfIntParametersToTry,
+                sizeof(cl_int) * numberOfIntParametersToTry, sizeof(cl_mem),
+                sizeof(cl_mem) + numberOfIntParametersToTry * sizeof(cl_int));
         }
         else
         {
-            log_info("Trying a kernel with %ld long arguments (%ld bytes) and one cl_mem (%ld bytes) for %ld bytes total.\n",
-                     numberOfIntParametersToTry, sizeof(cl_long)*numberOfIntParametersToTry, sizeof(cl_mem),
-                     sizeof(cl_mem)+numberOfIntParametersToTry*sizeof(cl_long));
+            log_info(
+                "Trying a kernel with %ld long arguments (%ld bytes) and one "
+                "cl_mem (%ld bytes) for %ld bytes total.\n",
+                numberOfIntParametersToTry,
+                sizeof(cl_long) * numberOfIntParametersToTry, sizeof(cl_mem),
+                sizeof(cl_mem) + numberOfIntParametersToTry * sizeof(cl_long));
         }
 
         // Allocate memory for the program storage
-        data = malloc(sizeof(cl_long)*numberOfIntParametersToTry);
-
-        argumentLine = (char*)malloc(sizeof(char)*numberOfIntParametersToTry*32);
-        codeLines = (char*)malloc(sizeof(char)*numberOfIntParametersToTry*32);
-        programSrc = (char*)malloc(sizeof(char)*(numberOfIntParametersToTry*64+1024));
+        data = malloc(sizeof(cl_long) * numberOfIntParametersToTry);
+
+        argumentLine =
+            (char *)malloc(sizeof(char) * numberOfIntParametersToTry * 32);
+        codeLines =
+            (char *)malloc(sizeof(char) * numberOfIntParametersToTry * 32);
+        programSrc = (char *)malloc(sizeof(char)
+                                    * (numberOfIntParametersToTry * 64 + 1024));
         argumentLine[0] = '\0';
         codeLines[0] = '\0';
         programSrc[0] = '\0';
 
         // Generate our results
         expectedResult = 0;
-        for (i=0; i<(int)numberOfIntParametersToTry; i++)
-            {
-            if( gHasLong )
+        for (i = 0; i < (int)numberOfIntParametersToTry; i++)
+        {
+            if (gHasLong)
             {
                 ((cl_long *)data)[i] = i;
                 expectedResult += i;
@@ -1068,30 +1297,35 @@ int test_min_max_parameter_size(cl_device_id deviceID, cl_context context, cl_co
         }
 
         // Build the program
-        if( gHasLong)
+        if (gHasLong)
             sprintf(argumentLine, "%s", "long arg0");
         else
             sprintf(argumentLine, "%s", "int arg0");
 
         sprintf(codeLines, "%s", "result[0] += arg0;");
-        for (i=1; i<(int)numberOfIntParametersToTry; i++)
+        for (i = 1; i < (int)numberOfIntParametersToTry; i++)
         {
-            if( gHasLong)
-                sprintf(argumentLine + strlen( argumentLine), ", long arg%d", i);
+            if (gHasLong)
+                sprintf(argumentLine + strlen(argumentLine), ", long arg%d", i);
             else
-                sprintf(argumentLine + strlen( argumentLine), ", int arg%d", i);
+                sprintf(argumentLine + strlen(argumentLine), ", int arg%d", i);
 
-            sprintf(codeLines + strlen( codeLines), "\nresult[0] += arg%d;", i);
+            sprintf(codeLines + strlen(codeLines), "\nresult[0] += arg%d;", i);
         }
 
         /* Create a kernel to test with */
-        sprintf( programSrc, gHasLong ?  sample_large_parmam_kernel_pattern[0]:
-                                        sample_large_int_parmam_kernel_pattern[0], argumentLine, codeLines);
+        sprintf(programSrc,
+                gHasLong ? sample_large_parmam_kernel_pattern[0]
+                         : sample_large_int_parmam_kernel_pattern[0],
+                argumentLine, codeLines);
 
         ptr = programSrc;
-        if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&ptr, "sample_test" ) != 0 )
+        if (create_single_kernel_helper(context, &program, &kernel, 1,
+                                        (const char **)&ptr, "sample_test")
+            != 0)
         {
-            log_info("Create program failed, decrementing number of parameters to try.\n");
+            log_info("Create program failed, decrementing number of parameters "
+                     "to try.\n");
             numberOfIntParametersToTry -= decrement;
             continue;
         }
@@ -1103,88 +1337,119 @@ int test_min_max_parameter_size(cl_device_id deviceID, cl_context context, cl_co
                              &error);
         test_error(error, "clCreateBuffer failed");
 
-        for (i=0; i<(int)numberOfIntParametersToTry; i++) {
-            if(gHasLong)
-                error = clSetKernelArg(kernel, i, sizeof(cl_long), &(((cl_long*)data)[i]));
+        for (i = 0; i < (int)numberOfIntParametersToTry; i++)
+        {
+            if (gHasLong)
+                error = clSetKernelArg(kernel, i, sizeof(cl_long),
+                                       &(((cl_long *)data)[i]));
             else
-                error = clSetKernelArg(kernel, i, sizeof(cl_int), &(((cl_int*)data)[i]));
+                error = clSetKernelArg(kernel, i, sizeof(cl_int),
+                                       &(((cl_int *)data)[i]));
 
-            if (error != CL_SUCCESS) {
-                log_info( "clSetKernelArg failed (%s), decrementing number of parameters to try.\n", IGetErrorString(error));
+            if (error != CL_SUCCESS)
+            {
+                log_info("clSetKernelArg failed (%s), decrementing number of "
+                         "parameters to try.\n",
+                         IGetErrorString(error));
                 numberOfIntParametersToTry -= decrement;
                 break;
             }
         }
-        if (error != CL_SUCCESS)
-            continue;
+        if (error != CL_SUCCESS) continue;
 
 
         error = clSetKernelArg(kernel, i, sizeof(cl_mem), &mem);
-        if (error != CL_SUCCESS) {
-            log_info( "clSetKernelArg failed (%s), decrementing number of parameters to try.\n", IGetErrorString(error));
+        if (error != CL_SUCCESS)
+        {
+            log_info("clSetKernelArg failed (%s), decrementing number of "
+                     "parameters to try.\n",
+                     IGetErrorString(error));
             numberOfIntParametersToTry -= decrement;
             continue;
         }
 
-        size_t globalDim[3]={1,1,1}, localDim[3]={1,1,1};
-        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalDim, localDim, 0, NULL, &event);
-        if (error != CL_SUCCESS) {
-            log_info( "clEnqueueNDRangeKernel failed (%s), decrementing number of parameters to try.\n", IGetErrorString(error));
+        size_t globalDim[3] = { 1, 1, 1 }, localDim[3] = { 1, 1, 1 };
+        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalDim,
+                                       localDim, 0, NULL, &event);
+        if (error != CL_SUCCESS)
+        {
+            log_info("clEnqueueNDRangeKernel failed (%s), decrementing number "
+                     "of parameters to try.\n",
+                     IGetErrorString(error));
             numberOfIntParametersToTry -= decrement;
             continue;
         }
 
         // Verify that the event does not return an error from the execution
         error = clWaitForEvents(1, &event);
-        test_error( error, "clWaitForEvent failed");
-        error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-        test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+        test_error(error, "clWaitForEvent failed");
+        error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                               sizeof(event_status), &event_status, NULL);
+        test_error(
+            error,
+            "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
         clReleaseEvent(event);
         if (event_status < 0)
             test_error(error, "Kernel execution event returned error");
 
-        if(gHasLong)
-            error = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0, sizeof(cl_long), &long_result, 0, NULL, NULL);
+        if (gHasLong)
+            error = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0, sizeof(cl_long),
+                                        &long_result, 0, NULL, NULL);
         else
-            error = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0, sizeof(cl_int), &int_result, 0, NULL, NULL);
+            error = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0, sizeof(cl_int),
+                                        &int_result, 0, NULL, NULL);
 
         test_error(error, "clEnqueueReadBuffer failed")
 
-        free(data);
+            free(data);
         free(argumentLine);
         free(codeLines);
         free(programSrc);
 
-        if(gHasLong)
+        if (gHasLong)
         {
-            if (long_result != expectedResult) {
-                log_error("Expected result (%lld) does not equal actual result (%lld).\n", expectedResult, long_result);
+            if (long_result != expectedResult)
+            {
+                log_error("Expected result (%lld) does not equal actual result "
+                          "(%lld).\n",
+                          expectedResult, long_result);
                 numberOfIntParametersToTry -= decrement;
                 continue;
-            } else {
-                log_info("Results verified at %ld bytes of arguments.\n", sizeof(cl_mem)+numberOfIntParametersToTry*sizeof(cl_long));
+            }
+            else
+            {
+                log_info("Results verified at %ld bytes of arguments.\n",
+                         sizeof(cl_mem)
+                             + numberOfIntParametersToTry * sizeof(cl_long));
                 break;
             }
         }
         else
         {
-            if (int_result != expectedResult) {
-                log_error("Expected result (%lld) does not equal actual result (%d).\n", expectedResult, int_result);
+            if (int_result != expectedResult)
+            {
+                log_error("Expected result (%lld) does not equal actual result "
+                          "(%d).\n",
+                          expectedResult, int_result);
                 numberOfIntParametersToTry -= decrement;
                 continue;
-            } else {
-                log_info("Results verified at %ld bytes of arguments.\n", sizeof(cl_mem)+numberOfIntParametersToTry*sizeof(cl_int));
+            }
+            else
+            {
+                log_info("Results verified at %ld bytes of arguments.\n",
+                         sizeof(cl_mem)
+                             + numberOfIntParametersToTry * sizeof(cl_int));
                 break;
             }
         }
     }
 
-    if (numberOfIntParametersToTry == (long)numberExpected)
-        return 0;
+    if (numberOfIntParametersToTry == (long)numberExpected) return 0;
     return -1;
 }
 
-int test_min_max_samplers(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_samplers(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
 {
     int error;
     cl_uint maxSamplers, i;
@@ -1197,104 +1462,124 @@ int test_min_max_samplers(cl_device_id deviceID, cl_context context, cl_command_
     cl_uint minRequiredSamplers = gIsEmbedded ? 8 : 16;
 
 
-    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID )
+    PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID)
 
     /* Get the max value */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_SAMPLERS, sizeof( maxSamplers ), &maxSamplers, NULL );
-    test_error( error, "Unable to get max sampler count from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_SAMPLERS,
+                            sizeof(maxSamplers), &maxSamplers, NULL);
+    test_error(error, "Unable to get max sampler count from device");
 
-    if( maxSamplers < minRequiredSamplers )
+    if (maxSamplers < minRequiredSamplers)
     {
-        log_error( "ERROR: Reported max sampler count is less than required! (%d)\n", (int)maxSamplers );
+        log_error(
+            "ERROR: Reported max sampler count is less than required! (%d)\n",
+            (int)maxSamplers);
         return -1;
     }
 
     log_info("Reported max %d samplers.\n", maxSamplers);
 
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxParameterSize ), &maxParameterSize, NULL );
-    test_error( error, "Unable to get max parameter size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE,
+                            sizeof(maxParameterSize), &maxParameterSize, NULL);
+    test_error(error, "Unable to get max parameter size from device");
 
     // Subtract the size of the result
-    maxParameterSize -= 2*sizeof(cl_mem);
+    maxParameterSize -= 2 * sizeof(cl_mem);
 
     // Calculate the number we can use
-    if (maxParameterSize/sizeof(cl_sampler) < maxSamplers) {
-        log_info("WARNING: Max parameter size of %d bytes limits test to %d max sampler arguments.\n", (int)maxParameterSize, (int)(maxParameterSize/sizeof(cl_sampler)));
-        maxSamplers = (unsigned int)(maxParameterSize/sizeof(cl_sampler));
+    if (maxParameterSize / sizeof(cl_sampler) < maxSamplers)
+    {
+        log_info("WARNING: Max parameter size of %d bytes limits test to %d "
+                 "max sampler arguments.\n",
+                 (int)maxParameterSize,
+                 (int)(maxParameterSize / sizeof(cl_sampler)));
+        maxSamplers = (unsigned int)(maxParameterSize / sizeof(cl_sampler));
     }
 
     /* Create a kernel to test with */
-    programSrc = (char *)malloc( ( strlen( sample_sampler_kernel_pattern[ 1 ] ) + 8 ) * ( maxSamplers ) +
-                                strlen( sample_sampler_kernel_pattern[ 0 ] ) + strlen( sample_sampler_kernel_pattern[ 2 ] ) +
-                                ( strlen( sample_sampler_kernel_pattern[ 3 ] ) + 8 ) * maxSamplers +
-                                strlen( sample_sampler_kernel_pattern[ 4 ] ) );
-    strcpy( programSrc, sample_sampler_kernel_pattern[ 0 ] );
-    for( i = 0; i < maxSamplers; i++ )
+    programSrc = (char *)malloc(
+        (strlen(sample_sampler_kernel_pattern[1]) + 8) * (maxSamplers)
+        + strlen(sample_sampler_kernel_pattern[0])
+        + strlen(sample_sampler_kernel_pattern[2])
+        + (strlen(sample_sampler_kernel_pattern[3]) + 8) * maxSamplers
+        + strlen(sample_sampler_kernel_pattern[4]));
+    strcpy(programSrc, sample_sampler_kernel_pattern[0]);
+    for (i = 0; i < maxSamplers; i++)
     {
-        sprintf( samplerLine, sample_sampler_kernel_pattern[ 1 ], i );
-        strcat( programSrc, samplerLine );
+        sprintf(samplerLine, sample_sampler_kernel_pattern[1], i);
+        strcat(programSrc, samplerLine);
     }
-    strcat( programSrc, sample_sampler_kernel_pattern[ 2 ] );
-    for( i = 0; i < maxSamplers; i++ )
+    strcat(programSrc, sample_sampler_kernel_pattern[2]);
+    for (i = 0; i < maxSamplers; i++)
     {
-        sprintf( samplerLine, sample_sampler_kernel_pattern[ 3 ], i );
-        strcat( programSrc, samplerLine );
+        sprintf(samplerLine, sample_sampler_kernel_pattern[3], i);
+        strcat(programSrc, samplerLine);
     }
-    strcat( programSrc, sample_sampler_kernel_pattern[ 4 ] );
+    strcat(programSrc, sample_sampler_kernel_pattern[4]);
 
 
-    error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&programSrc, "sample_test");
-    test_error( error, "Failed to create the program and kernel.");
+    error =
+        create_single_kernel_helper(context, &program, &kernel, 1,
+                                    (const char **)&programSrc, "sample_test");
+    test_error(error, "Failed to create the program and kernel.");
 
     // We have to set up some fake parameters so it'll work
     clSamplerWrapper *samplers = new clSamplerWrapper[maxSamplers];
 
     cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 };
 
-    clMemWrapper image = create_image_2d( context, CL_MEM_READ_WRITE, &format, 16, 16, 0, NULL, &error );
-    test_error( error, "Unable to create a test image" );
+    clMemWrapper image = create_image_2d(context, CL_MEM_READ_WRITE, &format,
+                                         16, 16, 0, NULL, &error);
+    test_error(error, "Unable to create a test image");
 
     clMemWrapper stream =
         clCreateBuffer(context, CL_MEM_READ_WRITE, 16, NULL, &error);
-    test_error( error, "Unable to create test buffer" );
+    test_error(error, "Unable to create test buffer");
 
-    error = clSetKernelArg( kernel, 0, sizeof( cl_mem ), &image );
-    error |= clSetKernelArg( kernel, 1, sizeof( cl_mem ), &stream );
-    test_error( error, "Unable to set kernel arguments" );
-    for( i = 0; i < maxSamplers; i++ )
+    error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &image);
+    error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &stream);
+    test_error(error, "Unable to set kernel arguments");
+    for (i = 0; i < maxSamplers; i++)
     {
-        samplers[ i ] = clCreateSampler( context, CL_FALSE, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error );
-        test_error( error, "Unable to create sampler" );
+        samplers[i] = clCreateSampler(context, CL_FALSE, CL_ADDRESS_NONE,
+                                      CL_FILTER_NEAREST, &error);
+        test_error(error, "Unable to create sampler");
 
-        error = clSetKernelArg( kernel, 2 + i, sizeof( cl_sampler ), &samplers[ i ] );
-        test_error( error, "Unable to set sampler argument" );
+        error = clSetKernelArg(kernel, 2 + i, sizeof(cl_sampler), &samplers[i]);
+        test_error(error, "Unable to set sampler argument");
     }
 
-    size_t globalDim[3]={1,1,1}, localDim[3]={1,1,1};
-    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalDim, localDim, 0, NULL, &event);
-    test_error(error, "clEnqueueNDRangeKernel failed with maximum number of samplers.");
+    size_t globalDim[3] = { 1, 1, 1 }, localDim[3] = { 1, 1, 1 };
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalDim, localDim,
+                                   0, NULL, &event);
+    test_error(
+        error,
+        "clEnqueueNDRangeKernel failed with maximum number of samplers.");
 
     // Verify that the event does not return an error from the execution
     error = clWaitForEvents(1, &event);
-    test_error( error, "clWaitForEvent failed");
-    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-    test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+    test_error(error, "clWaitForEvent failed");
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(event_status), &event_status, NULL);
+    test_error(error,
+               "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
     clReleaseEvent(event);
     if (event_status < 0)
         test_error(error, "Kernel execution event returned error");
 
-    free( programSrc );
+    free(programSrc);
     delete[] samplers;
     return 0;
 }
 
 #define PASSING_FRACTION 4
-int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue, int num_elements)
 {
     int error;
     clProgramWrapper program;
     clKernelWrapper kernel;
-    size_t    threads[1], localThreads[1];
+    size_t threads[1], localThreads[1];
     cl_int *constantData, *resultData;
     cl_ulong maxSize, stepSize, currentSize, maxGlobalSize, maxAllocSize;
     int i;
@@ -1303,48 +1588,56 @@ int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context,
     MTdata d;
 
     /* Verify our test buffer won't be bigger than allowed */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, 0 );
-    test_error( error, "Unable to get max constant buffer size" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
+                            sizeof(maxSize), &maxSize, 0);
+    test_error(error, "Unable to get max constant buffer size");
 
-    if( ( 0 == gIsEmbedded && maxSize < 64L * 1024L ) || maxSize <  1L * 1024L )
+    if ((0 == gIsEmbedded && maxSize < 64L * 1024L) || maxSize < 1L * 1024L)
     {
-        log_error( "ERROR: Reported max constant buffer size less than required by OpenCL 1.0 (reported %d KB)\n", (int)( maxSize / 1024L ) );
+        log_error("ERROR: Reported max constant buffer size less than required "
+                  "by OpenCL 1.0 (reported %d KB)\n",
+                  (int)(maxSize / 1024L));
         return -1;
     }
 
     log_info("Reported max constant buffer size of %lld bytes.\n", maxSize);
 
     // Limit test buffer size to 1/8 of CL_DEVICE_GLOBAL_MEM_SIZE
-    error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(maxGlobalSize), &maxGlobalSize, 0);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
+                            sizeof(maxGlobalSize), &maxGlobalSize, 0);
     test_error(error, "Unable to get CL_DEVICE_GLOBAL_MEM_SIZE");
 
-    if (maxSize > maxGlobalSize / 8)
-        maxSize = maxGlobalSize / 8;
+    if (maxSize > maxGlobalSize / 8) maxSize = maxGlobalSize / 8;
 
-    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(maxAllocSize), &maxAllocSize, 0);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(maxAllocSize), &maxAllocSize, 0);
     test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE ");
-    
-    if (maxSize > maxAllocSize)
-        maxSize = maxAllocSize;
-    
+
+    if (maxSize > maxAllocSize) maxSize = maxAllocSize;
+
     /* Create a kernel to test with */
-    if( create_single_kernel_helper( context, &program, &kernel, 1, sample_const_arg_kernel, "sample_test" ) != 0 )
+    if (create_single_kernel_helper(context, &program, &kernel, 1,
+                                    sample_const_arg_kernel, "sample_test")
+        != 0)
     {
         return -1;
     }
 
     /* Try the returned max size and decrease it until we get one that works. */
-    stepSize = maxSize/16;
+    stepSize = maxSize / 16;
     currentSize = maxSize;
     int allocPassed = 0;
-    d = init_genrand( gRandomSeed );
-    while (!allocPassed && currentSize >= maxSize/PASSING_FRACTION) {
-        log_info("Attempting to allocate constant buffer of size %lld bytes\n", maxSize);
+    d = init_genrand(gRandomSeed);
+    while (!allocPassed && currentSize >= maxSize / PASSING_FRACTION)
+    {
+        log_info("Attempting to allocate constant buffer of size %lld bytes\n",
+                 maxSize);
 
         /* Create some I/O streams */
-        size_t sizeToAllocate = ((size_t)currentSize/sizeof( cl_int ))*sizeof(cl_int);
-        size_t numberOfInts = sizeToAllocate/sizeof(cl_int);
-        constantData = (cl_int *)malloc( sizeToAllocate);
+        size_t sizeToAllocate =
+            ((size_t)currentSize / sizeof(cl_int)) * sizeof(cl_int);
+        size_t numberOfInts = sizeToAllocate / sizeof(cl_int);
+        constantData = (cl_int *)malloc(sizeToAllocate);
         if (constantData == NULL)
         {
             log_error("Failed to allocate memory for constantData!\n");
@@ -1352,53 +1645,74 @@ int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context,
             return EXIT_FAILURE;
         }
 
-        for(i=0; i<(int)(numberOfInts); i++)
+        for (i = 0; i < (int)(numberOfInts); i++)
             constantData[i] = (int)genrand_int32(d);
 
         clMemWrapper streams[3];
         streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                                     sizeToAllocate, constantData, &error);
-        test_error( error, "Creating test array failed" );
+        test_error(error, "Creating test array failed");
         streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeToAllocate,
                                     NULL, &error);
-        test_error( error, "Creating test array failed" );
+        test_error(error, "Creating test array failed");
 
 
         /* Set the arguments */
-        error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0]);
-        test_error( error, "Unable to set indexed kernel arguments" );
-        error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1]);
-        test_error( error, "Unable to set indexed kernel arguments" );
+        error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+        test_error(error, "Unable to set indexed kernel arguments");
+        error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
+        test_error(error, "Unable to set indexed kernel arguments");
 
 
         /* Test running the kernel and verifying it */
         threads[0] = numberOfInts;
         localThreads[0] = 1;
-        log_info("Filling constant buffer with %d cl_ints (%d bytes).\n", (int)threads[0], (int)(threads[0]*sizeof(cl_int)));
-
-        error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, &event );
-        /* If we failed due to a resource issue, reduce the size and try again. */
-        if ((error == CL_OUT_OF_RESOURCES) || (error == CL_MEM_OBJECT_ALLOCATION_FAILURE) || (error == CL_OUT_OF_HOST_MEMORY)) {
-            log_info("Kernel enqueue failed at size %lld, trying at a reduced size.\n", currentSize);
+        log_info("Filling constant buffer with %d cl_ints (%d bytes).\n",
+                 (int)threads[0], (int)(threads[0] * sizeof(cl_int)));
+
+        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                       localThreads, 0, NULL, &event);
+        /* If we failed due to a resource issue, reduce the size and try again.
+         */
+        if ((error == CL_OUT_OF_RESOURCES)
+            || (error == CL_MEM_OBJECT_ALLOCATION_FAILURE)
+            || (error == CL_OUT_OF_HOST_MEMORY))
+        {
+            log_info("Kernel enqueue failed at size %lld, trying at a reduced "
+                     "size.\n",
+                     currentSize);
             currentSize -= stepSize;
             free(constantData);
             continue;
         }
-        test_error( error, "clEnqueueNDRangeKernel with maximum constant buffer size failed.");
+        test_error(
+            error,
+            "clEnqueueNDRangeKernel with maximum constant buffer size failed.");
 
         // Verify that the event does not return an error from the execution
         error = clWaitForEvents(1, &event);
-        test_error( error, "clWaitForEvent failed");
-        error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-        test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+        test_error(error, "clWaitForEvent failed");
+        error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                               sizeof(event_status), &event_status, NULL);
+        test_error(
+            error,
+            "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
         clReleaseEvent(event);
-        if (event_status < 0) {
-            if ((event_status == CL_OUT_OF_RESOURCES) || (event_status == CL_MEM_OBJECT_ALLOCATION_FAILURE) || (event_status == CL_OUT_OF_HOST_MEMORY)) {
-                log_info("Kernel event indicates failure at size %lld, trying at a reduced size.\n", currentSize);
+        if (event_status < 0)
+        {
+            if ((event_status == CL_OUT_OF_RESOURCES)
+                || (event_status == CL_MEM_OBJECT_ALLOCATION_FAILURE)
+                || (event_status == CL_OUT_OF_HOST_MEMORY))
+            {
+                log_info("Kernel event indicates failure at size %lld, trying "
+                         "at a reduced size.\n",
+                         currentSize);
                 currentSize -= stepSize;
                 free(constantData);
                 continue;
-            } else {
+            }
+            else
+            {
                 test_error(error, "Kernel execution event returned error");
             }
         }
@@ -1415,30 +1729,41 @@ int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context,
             return EXIT_FAILURE;
         }
 
-        error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, sizeToAllocate, resultData, 0, NULL, NULL);
-        test_error( error, "clEnqueueReadBuffer failed");
+        error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0,
+                                    sizeToAllocate, resultData, 0, NULL, NULL);
+        test_error(error, "clEnqueueReadBuffer failed");
 
-        for(i=0; i<(int)(numberOfInts); i++)
-            if (constantData[i] != resultData[i]) {
-                log_error("Data failed to verify: constantData[%d]=%d != resultData[%d]=%d\n",
+        for (i = 0; i < (int)(numberOfInts); i++)
+            if (constantData[i] != resultData[i])
+            {
+                log_error("Data failed to verify: constantData[%d]=%d != "
+                          "resultData[%d]=%d\n",
                           i, constantData[i], i, resultData[i]);
-                free( constantData );
+                free(constantData);
                 free(resultData);
-                free_mtdata(d);   d = NULL;
+                free_mtdata(d);
+                d = NULL;
                 return -1;
             }
 
-        free( constantData );
+        free(constantData);
         free(resultData);
     }
-    free_mtdata(d);   d = NULL;
+    free_mtdata(d);
+    d = NULL;
 
-    if (allocPassed) {
-        if (currentSize < maxSize/PASSING_FRACTION) {
-            log_error("Failed to allocate at least 1/8 of the reported constant size.\n");
+    if (allocPassed)
+    {
+        if (currentSize < maxSize / PASSING_FRACTION)
+        {
+            log_error("Failed to allocate at least 1/8 of the reported "
+                      "constant size.\n");
             return -1;
-        } else if (currentSize != maxSize) {
-            log_info("Passed at reduced size. (%lld of %lld bytes)\n", currentSize, maxSize);
+        }
+        else if (currentSize != maxSize)
+        {
+            log_info("Passed at reduced size. (%lld of %lld bytes)\n",
+                     currentSize, maxSize);
             return 0;
         }
         return 0;
@@ -1446,13 +1771,14 @@ int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context,
     return -1;
 }
 
-int test_min_max_constant_args(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_constant_args(cl_device_id deviceID, cl_context context,
+                               cl_command_queue queue, int num_elements)
 {
     int error;
     clProgramWrapper program;
     clKernelWrapper kernel;
-    clMemWrapper    *streams;
-    size_t    threads[1], localThreads[1];
+    clMemWrapper *streams;
+    size_t threads[1], localThreads[1];
     cl_uint i, maxArgs;
     cl_ulong maxSize;
     cl_ulong maxParameterSize;
@@ -1465,119 +1791,145 @@ int test_min_max_constant_args(cl_device_id deviceID, cl_context context, cl_com
 
 
     /* Verify our test buffer won't be bigger than allowed */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof( maxArgs ), &maxArgs, 0 );
-    test_error( error, "Unable to get max constant arg count" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_CONSTANT_ARGS,
+                            sizeof(maxArgs), &maxArgs, 0);
+    test_error(error, "Unable to get max constant arg count");
 
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxParameterSize ), &maxParameterSize, NULL );
-    test_error( error, "Unable to get max parameter size from device" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE,
+                            sizeof(maxParameterSize), &maxParameterSize, NULL);
+    test_error(error, "Unable to get max parameter size from device");
 
     // Subtract the size of the result
     maxParameterSize -= sizeof(cl_mem);
 
     // Calculate the number we can use
-    if (maxParameterSize/sizeof(cl_mem) < maxArgs) {
-        log_info("WARNING: Max parameter size of %d bytes limits test to %d max image arguments.\n", (int)maxParameterSize, (int)(maxParameterSize/sizeof(cl_mem)));
-        maxArgs = (unsigned int)(maxParameterSize/sizeof(cl_mem));
+    if (maxParameterSize / sizeof(cl_mem) < maxArgs)
+    {
+        log_info("WARNING: Max parameter size of %d bytes limits test to %d "
+                 "max image arguments.\n",
+                 (int)maxParameterSize,
+                 (int)(maxParameterSize / sizeof(cl_mem)));
+        maxArgs = (unsigned int)(maxParameterSize / sizeof(cl_mem));
     }
 
 
-    if( maxArgs < (gIsEmbedded ? 4 : 8) )
+    if (maxArgs < (gIsEmbedded ? 4 : 8))
     {
-        log_error( "ERROR: Reported max constant arg count less than required by OpenCL 1.0 (reported %d)\n", (int)maxArgs );
+        log_error("ERROR: Reported max constant arg count less than required "
+                  "by OpenCL 1.0 (reported %d)\n",
+                  (int)maxArgs);
         return -1;
     }
 
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, 0 );
-    test_error( error, "Unable to get max constant buffer size" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
+                            sizeof(maxSize), &maxSize, 0);
+    test_error(error, "Unable to get max constant buffer size");
     individualBufferSize = (maxSize / 2) / maxArgs;
 
-    log_info("Reported max constant arg count of %d and max constant buffer size of %d. Test will attempt to allocate half of that, or %d buffers of size %d.\n",
-             (int)maxArgs, (int)maxSize, (int)maxArgs, (int)individualBufferSize);
+    log_info(
+        "Reported max constant arg count of %u and max constant buffer "
+        "size of %llu. Test will attempt to allocate half of that, or %u "
+        "buffers of size %zu.\n",
+        maxArgs, maxSize, maxArgs, individualBufferSize);
 
-    str2 = (char*)malloc(sizeof(char)*32*(maxArgs+2));
-    constArgs = (char*)malloc(sizeof(char)*32*(maxArgs+2));
-    programSrc = (char*)malloc(sizeof(char)*32*2*(maxArgs+2)+1024);
+    str2 = (char *)malloc(sizeof(char) * 32 * (maxArgs + 2));
+    constArgs = (char *)malloc(sizeof(char) * 32 * (maxArgs + 2));
+    programSrc = (char *)malloc(sizeof(char) * 32 * 2 * (maxArgs + 2) + 1024);
 
     /* Create a test program */
     constArgs[0] = 0;
     str2[0] = 0;
-    for( i = 0; i < maxArgs-1; i++ )
-    {
-        sprintf( str, ", __constant int *src%d", (int)( i + 2 ) );
-        strcat( constArgs, str );
-        sprintf( str2 + strlen( str2), "\tdst[tid] += src%d[tid];\n", (int)(i+2));
-        if (strlen(str2) > (sizeof(char)*32*(maxArgs+2)-32) || strlen(constArgs) > (sizeof(char)*32*(maxArgs+2)-32)) {
-            log_info("Limiting number of arguments tested to %d due to test program allocation size.\n", i);
+    for (i = 0; i < maxArgs - 1; i++)
+    {
+        sprintf(str, ", __constant int *src%d", (int)(i + 2));
+        strcat(constArgs, str);
+        sprintf(str2 + strlen(str2), "\tdst[tid] += src%d[tid];\n",
+                (int)(i + 2));
+        if (strlen(str2) > (sizeof(char) * 32 * (maxArgs + 2) - 32)
+            || strlen(constArgs) > (sizeof(char) * 32 * (maxArgs + 2) - 32))
+        {
+            log_info("Limiting number of arguments tested to %u due to test "
+                     "program allocation size.\n",
+                     i);
             break;
         }
     }
-    sprintf( programSrc, sample_const_max_arg_kernel_pattern, constArgs, str2 );
+    sprintf(programSrc, sample_const_max_arg_kernel_pattern, constArgs, str2);
 
     /* Create a kernel to test with */
     ptr = programSrc;
-    if( create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "sample_test" ) != 0 )
+    if (create_single_kernel_helper(context, &program, &kernel, 1, &ptr,
+                                    "sample_test")
+        != 0)
     {
         return -1;
     }
 
     /* Create some I/O streams */
-    streams = new clMemWrapper[ maxArgs + 1 ];
-    for( i = 0; i < maxArgs + 1; i++ )
+    streams = new clMemWrapper[maxArgs + 1];
+    for (i = 0; i < maxArgs + 1; i++)
     {
         streams[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                     individualBufferSize, NULL, &error);
-        test_error( error, "Creating test array failed" );
+        test_error(error, "Creating test array failed");
     }
 
     /* Set the arguments */
-    for( i = 0; i < maxArgs + 1; i++ )
+    for (i = 0; i < maxArgs + 1; i++)
     {
-        error = clSetKernelArg(kernel, i, sizeof( streams[i] ), &streams[i]);
-        test_error( error, "Unable to set kernel argument" );
+        error = clSetKernelArg(kernel, i, sizeof(streams[i]), &streams[i]);
+        test_error(error, "Unable to set kernel argument");
     }
 
     /* Test running the kernel and verifying it */
     threads[0] = (size_t)10;
-    while (threads[0]*sizeof(cl_int) > individualBufferSize)
-        threads[0]--;
+    while (threads[0] * sizeof(cl_int) > individualBufferSize) threads[0]--;
 
-    error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
-    test_error( error, "Unable to get work group size to use" );
+    error = get_max_common_work_group_size(context, kernel, threads[0],
+                                           &localThreads[0]);
+    test_error(error, "Unable to get work group size to use");
 
-    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, &event );
-    test_error( error, "clEnqueueNDRangeKernel failed");
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, &event);
+    test_error(error, "clEnqueueNDRangeKernel failed");
 
     // Verify that the event does not return an error from the execution
     error = clWaitForEvents(1, &event);
-    test_error( error, "clWaitForEvent failed");
-    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-    test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+    test_error(error, "clWaitForEvent failed");
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof(event_status), &event_status, NULL);
+    test_error(error,
+               "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
     clReleaseEvent(event);
     if (event_status < 0)
         test_error(error, "Kernel execution event returned error");
 
     error = clFinish(queue);
-    test_error( error, "clFinish failed.");
+    test_error(error, "clFinish failed.");
 
-    delete [] streams;
+    delete[] streams;
     free(str2);
     free(constArgs);
     free(programSrc);
     return 0;
 }
 
-int test_min_max_compute_units(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_compute_units(cl_device_id deviceID, cl_context context,
+                               cl_command_queue queue, int num_elements)
 {
     int error;
     cl_uint value;
 
 
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof( value ), &value, 0 );
-    test_error( error, "Unable to get compute unit count" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
+                            sizeof(value), &value, 0);
+    test_error(error, "Unable to get compute unit count");
 
-    if( value < 1 )
+    if (value < 1)
     {
-        log_error( "ERROR: Reported compute unit count less than required by OpenCL 1.0 (reported %d)\n", (int)value );
+        log_error("ERROR: Reported compute unit count less than required by "
+                  "OpenCL 1.0 (reported %d)\n",
+                  (int)value);
         return -1;
     }
 
@@ -1586,18 +1938,22 @@ int test_min_max_compute_units(cl_device_id deviceID, cl_context context, cl_com
     return 0;
 }
 
-int test_min_max_address_bits(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_address_bits(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, int num_elements)
 {
     int error;
     cl_uint value;
 
 
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_ADDRESS_BITS, sizeof( value ), &value, 0 );
-    test_error( error, "Unable to get address bit count" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_ADDRESS_BITS, sizeof(value),
+                            &value, 0);
+    test_error(error, "Unable to get address bit count");
 
-    if( value != 32 && value != 64 )
+    if (value != 32 && value != 64)
     {
-        log_error( "ERROR: Reported address bit count not valid by OpenCL 1.0 (reported %d)\n", (int)value );
+        log_error("ERROR: Reported address bit count not valid by OpenCL 1.0 "
+                  "(reported %d)\n",
+                  (int)value);
         return -1;
     }
 
@@ -1606,68 +1962,84 @@ int test_min_max_address_bits(cl_device_id deviceID, cl_context context, cl_comm
     return 0;
 }
 
-int test_min_max_single_fp_config(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_single_fp_config(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
 {
     int error;
     cl_device_fp_config value;
     char profile[128] = "";
 
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_SINGLE_FP_CONFIG, sizeof( value ), &value, 0 );
-    test_error( error, "Unable to get device single fp config" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(value),
+                            &value, 0);
+    test_error(error, "Unable to get device single fp config");
 
-    //Check to see if we are an embedded profile device
-    if((error = clGetDeviceInfo( deviceID, CL_DEVICE_PROFILE, sizeof(profile), profile, NULL )))
+    // Check to see if we are an embedded profile device
+    if ((error = clGetDeviceInfo(deviceID, CL_DEVICE_PROFILE, sizeof(profile),
+                                 profile, NULL)))
     {
-        log_error( "FAILURE: Unable to get CL_DEVICE_PROFILE: error %d\n", error );
+        log_error("FAILURE: Unable to get CL_DEVICE_PROFILE: error %d\n",
+                  error);
         return error;
     }
 
-    if( 0 == strcmp( profile, "EMBEDDED_PROFILE" ))
+    if (0 == strcmp(profile, "EMBEDDED_PROFILE"))
     { // embedded device
 
-        if( 0 == (value & (CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO)))
+        if (0 == (value & (CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO)))
         {
-            log_error( "FAILURE: embedded device supports neither CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n" );
+            log_error("FAILURE: embedded device supports neither "
+                      "CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n");
             return -1;
         }
     }
     else
     { // Full profile
-        if( ( value & ( CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN )) != ( CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN ) )
+        if ((value & (CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN))
+            != (CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN))
         {
-            log_error( "ERROR: Reported single fp config doesn't meet minimum set by OpenCL 1.0 (reported 0x%08x)\n", (int)value );
+            log_error("ERROR: Reported single fp config doesn't meet minimum "
+                      "set by OpenCL 1.0 (reported 0x%08x)\n",
+                      (int)value);
             return -1;
         }
     }
     return 0;
 }
 
-int test_min_max_double_fp_config(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_double_fp_config(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
 {
     int error;
     cl_device_fp_config value;
 
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof( value ), &value, 0 );
-    test_error( error, "Unable to get device double fp config" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(value),
+                            &value, 0);
+    test_error(error, "Unable to get device double fp config");
 
-    if (value == 0)
-        return 0;
+    if (value == 0) return 0;
 
-    if( ( value & (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM)) != ( CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM) )
+    if ((value
+         & (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
+            | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM))
+        != (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
+            | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM))
     {
-        log_error( "ERROR: Reported double fp config doesn't meet minimum set by OpenCL 1.0 (reported 0x%08x)\n", (int)value );
+        log_error("ERROR: Reported double fp config doesn't meet minimum set "
+                  "by OpenCL 1.0 (reported 0x%08x)\n",
+                  (int)value);
         return -1;
     }
     return 0;
 }
 
-int test_min_max_local_mem_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_local_mem_size(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
 {
     int error;
     clProgramWrapper program;
     clKernelWrapper kernel;
-    clMemWrapper            streams[3];
-    size_t    threads[1], localThreads[1];
+    clMemWrapper streams[3];
+    size_t threads[1], localThreads[1];
     cl_int *localData, *resultData;
     cl_ulong maxSize, kernelLocalUsage, min_max_local_mem_size;
     Version device_version;
@@ -1676,8 +2048,9 @@ int test_min_max_local_mem_size(cl_device_id deviceID, cl_context context, cl_co
     MTdata d;
 
     /* Verify our test buffer won't be bigger than allowed */
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( maxSize ), &maxSize, 0 );
-    test_error( error, "Unable to get max local buffer size" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(maxSize),
+                            &maxSize, 0);
+    test_error(error, "Unable to get max local buffer size");
 
     try
     {
@@ -1709,65 +2082,80 @@ int test_min_max_local_mem_size(cl_device_id deviceID, cl_context context, cl_co
         return -1;
     }
 
-    log_info("Reported max local buffer size for device: %lld bytes.\n", maxSize);
+    log_info("Reported max local buffer size for device: %lld bytes.\n",
+             maxSize);
 
     /* Create a kernel to test with */
-    if( create_single_kernel_helper( context, &program, &kernel, 1, sample_local_arg_kernel, "sample_test" ) != 0 )
+    if (create_single_kernel_helper(context, &program, &kernel, 1,
+                                    sample_local_arg_kernel, "sample_test")
+        != 0)
     {
         return -1;
     }
 
-    error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(kernelLocalUsage), &kernelLocalUsage, NULL);
-    test_error(error, "clGetKernelWorkGroupInfo for CL_KERNEL_LOCAL_MEM_SIZE failed");
+    error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
+                                     sizeof(kernelLocalUsage),
+                                     &kernelLocalUsage, NULL);
+    test_error(error,
+               "clGetKernelWorkGroupInfo for CL_KERNEL_LOCAL_MEM_SIZE failed");
 
-    log_info("Reported local buffer usage for kernel (CL_KERNEL_LOCAL_MEM_SIZE): %lld bytes.\n", kernelLocalUsage);
+    log_info("Reported local buffer usage for kernel "
+             "(CL_KERNEL_LOCAL_MEM_SIZE): %lld bytes.\n",
+             kernelLocalUsage);
 
     /* Create some I/O streams */
-    size_t sizeToAllocate = ((size_t)(maxSize-kernelLocalUsage)/sizeof( cl_int ))*sizeof(cl_int);
-    size_t numberOfInts = sizeToAllocate/sizeof(cl_int);
+    size_t sizeToAllocate =
+        ((size_t)(maxSize - kernelLocalUsage) / sizeof(cl_int))
+        * sizeof(cl_int);
+    size_t numberOfInts = sizeToAllocate / sizeof(cl_int);
 
-    log_info("Attempting to use %lld bytes of local memory.\n", (cl_ulong)sizeToAllocate);
+    log_info("Attempting to use %zu bytes of local memory.\n", sizeToAllocate);
 
-    localData = (cl_int *)malloc( sizeToAllocate );
-    d = init_genrand( gRandomSeed );
-    for(i=0; i<(int)(numberOfInts); i++)
+    localData = (cl_int *)malloc(sizeToAllocate);
+    d = init_genrand(gRandomSeed);
+    for (i = 0; i < (int)(numberOfInts); i++)
         localData[i] = (int)genrand_int32(d);
-    free_mtdata(d); d = NULL;
+    free_mtdata(d);
+    d = NULL;
 
     streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeToAllocate,
                                 localData, &error);
-    test_error( error, "Creating test array failed" );
+    test_error(error, "Creating test array failed");
     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeToAllocate,
                                 NULL, &error);
-    test_error( error, "Creating test array failed" );
+    test_error(error, "Creating test array failed");
 
 
     /* Set the arguments */
     error = clSetKernelArg(kernel, 0, sizeToAllocate, NULL);
-    test_error( error, "Unable to set indexed kernel arguments" );
-    error = clSetKernelArg(kernel, 1, sizeof( streams[0] ), &streams[0]);
-    test_error( error, "Unable to set indexed kernel arguments" );
-    error = clSetKernelArg(kernel, 2, sizeof( streams[1] ), &streams[1]);
-    test_error( error, "Unable to set indexed kernel arguments" );
+    test_error(error, "Unable to set indexed kernel arguments");
+    error = clSetKernelArg(kernel, 1, sizeof(streams[0]), &streams[0]);
+    test_error(error, "Unable to set indexed kernel arguments");
+    error = clSetKernelArg(kernel, 2, sizeof(streams[1]), &streams[1]);
+    test_error(error, "Unable to set indexed kernel arguments");
 
 
     /* Test running the kernel and verifying it */
     threads[0] = numberOfInts;
     localThreads[0] = 1;
-    log_info("Creating local buffer with %d cl_ints (%d bytes).\n", (int)numberOfInts, (int)sizeToAllocate);
+    log_info("Creating local buffer with %zu cl_ints (%zu bytes).\n",
+             numberOfInts, sizeToAllocate);
 
     cl_event evt;
-    cl_int   evt_err;
-    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, &evt );
+    cl_int evt_err;
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, &evt);
     test_error(error, "clEnqueueNDRangeKernel failed");
 
     error = clFinish(queue);
-    test_error( error, "clFinish failed");
+    test_error(error, "clFinish failed");
 
-    error = clGetEventInfo(evt, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof evt_err, &evt_err, NULL);
-    test_error( error, "clGetEventInfo with maximum local buffer size failed.");
+    error = clGetEventInfo(evt, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                           sizeof evt_err, &evt_err, NULL);
+    test_error(error, "clGetEventInfo with maximum local buffer size failed.");
 
-    if (evt_err != CL_COMPLETE) {
+    if (evt_err != CL_COMPLETE)
+    {
         print_error(evt_err, "Kernel event returned error");
         clReleaseEvent(evt);
         return -1;
@@ -1775,95 +2163,118 @@ int test_min_max_local_mem_size(cl_device_id deviceID, cl_context context, cl_co
 
     resultData = (cl_int *)malloc(sizeToAllocate);
 
-    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, sizeToAllocate, resultData, 0, NULL, NULL);
-    test_error( error, "clEnqueueReadBuffer failed");
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, sizeToAllocate,
+                                resultData, 0, NULL, NULL);
+    test_error(error, "clEnqueueReadBuffer failed");
 
-    for(i=0; i<(int)(numberOfInts); i++)
-        if (localData[i] != resultData[i]) {
+    for (i = 0; i < (int)(numberOfInts); i++)
+        if (localData[i] != resultData[i])
+        {
             clReleaseEvent(evt);
-            free( localData );
+            free(localData);
             free(resultData);
             log_error("Results failed to verify.\n");
             return -1;
         }
     clReleaseEvent(evt);
-    free( localData );
+    free(localData);
     free(resultData);
 
     return err;
 }
 
-int test_min_max_kernel_preferred_work_group_size_multiple(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_kernel_preferred_work_group_size_multiple(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
 {
-    int                err;
+    int err;
     clProgramWrapper program;
     clKernelWrapper kernel;
 
     size_t max_local_workgroup_size[3];
     size_t max_workgroup_size = 0, preferred_workgroup_size = 0;
 
-    err = create_single_kernel_helper(context, &program, &kernel, 1, sample_local_arg_kernel, "sample_test" );
+    err = create_single_kernel_helper(context, &program, &kernel, 1,
+                                      sample_local_arg_kernel, "sample_test");
     test_error(err, "Failed to build kernel/program.");
 
     err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE,
-                                   sizeof(max_workgroup_size), &max_workgroup_size, NULL);
+                                   sizeof(max_workgroup_size),
+                                   &max_workgroup_size, NULL);
     test_error(err, "clGetKernelWorkgroupInfo failed.");
 
-    err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
-                                   sizeof(preferred_workgroup_size), &preferred_workgroup_size, NULL);
+    err = clGetKernelWorkGroupInfo(
+        kernel, deviceID, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+        sizeof(preferred_workgroup_size), &preferred_workgroup_size, NULL);
     test_error(err, "clGetKernelWorkgroupInfo failed.");
 
-    err = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
+    err = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                          sizeof(max_local_workgroup_size),
+                          max_local_workgroup_size, NULL);
     test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
 
-    // Since the preferred size is only a performance hint, we can only really check that we get a sane value
-    // back
-    log_info( "size: %ld     preferred: %ld      max: %ld\n", max_workgroup_size, preferred_workgroup_size, max_local_workgroup_size[0] );
+    // Since the preferred size is only a performance hint, we can only really
+    // check that we get a sane value back
+    log_info("size: %zu     preferred: %zu      max: %zu\n", max_workgroup_size,
+             preferred_workgroup_size, max_local_workgroup_size[0]);
 
-    if( preferred_workgroup_size > max_workgroup_size )
+    if (preferred_workgroup_size > max_workgroup_size)
     {
-        log_error( "ERROR: Reported preferred workgroup multiple larger than max workgroup size (preferred %ld, max %ld)\n", preferred_workgroup_size, max_workgroup_size );
+        log_error("ERROR: Reported preferred workgroup multiple larger than "
+                  "max workgroup size (preferred %zu, max %zu)\n",
+                  preferred_workgroup_size, max_workgroup_size);
         return -1;
     }
 
     return 0;
 }
 
-int test_min_max_execution_capabilities(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_execution_capabilities(cl_device_id deviceID,
+                                        cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements)
 {
     int error;
     cl_device_exec_capabilities value;
 
 
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_EXECUTION_CAPABILITIES, sizeof( value ), &value, 0 );
-    test_error( error, "Unable to get execution capabilities" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_EXECUTION_CAPABILITIES,
+                            sizeof(value), &value, 0);
+    test_error(error, "Unable to get execution capabilities");
 
-    if( ( value & CL_EXEC_KERNEL ) != CL_EXEC_KERNEL )
+    if ((value & CL_EXEC_KERNEL) != CL_EXEC_KERNEL)
     {
-        log_error( "ERROR: Reported execution capabilities less than required by OpenCL 1.0 (reported 0x%08x)\n", (int)value );
+        log_error("ERROR: Reported execution capabilities less than required "
+                  "by OpenCL 1.0 (reported 0x%08x)\n",
+                  (int)value);
         return -1;
     }
     return 0;
 }
 
-int test_min_max_queue_properties(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_queue_properties(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
 {
     int error;
     cl_command_queue_properties value;
 
 
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, sizeof( value ), &value, 0 );
-    test_error( error, "Unable to get queue properties" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES,
+                            sizeof(value), &value, 0);
+    test_error(error, "Unable to get queue properties");
 
-    if( ( value & CL_QUEUE_PROFILING_ENABLE ) != CL_QUEUE_PROFILING_ENABLE )
+    if ((value & CL_QUEUE_PROFILING_ENABLE) != CL_QUEUE_PROFILING_ENABLE)
     {
-        log_error( "ERROR: Reported queue properties less than required by OpenCL 1.0 (reported 0x%08x)\n", (int)value );
+        log_error("ERROR: Reported queue properties less than required by "
+                  "OpenCL 1.0 (reported 0x%08x)\n",
+                  (int)value);
         return -1;
     }
     return 0;
 }
 
-int test_min_max_device_version(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_device_version(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
 {
     // Query for the device version.
     Version device_cl_version = get_device_cl_version(deviceID);
@@ -1959,84 +2370,101 @@ int test_min_max_device_version(cl_device_id deviceID, cl_context context, cl_co
     return 0;
 }
 
-int test_min_max_language_version(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_min_max_language_version(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue, int num_elements)
 {
     cl_int error;
-    cl_char buffer[ 4098 ];
+    cl_char buffer[4098];
     size_t length;
 
     // Device version should fit the regex "OpenCL [0-9]+\.[0-9]+ *.*"
-    error = clGetDeviceInfo( deviceID, CL_DEVICE_OPENCL_C_VERSION, sizeof( buffer ), buffer, &length );
-    test_error( error, "Unable to get device opencl c version string" );
-    if( memcmp( buffer, "OpenCL C ", strlen( "OpenCL C " ) ) != 0 )
-    {
-        log_error( "ERROR: Initial part of device language version string does not match required format! (returned: \"%s\")\n", (char *)buffer );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_OPENCL_C_VERSION,
+                            sizeof(buffer), buffer, &length);
+    test_error(error, "Unable to get device opencl c version string");
+    if (memcmp(buffer, "OpenCL C ", strlen("OpenCL C ")) != 0)
+    {
+        log_error("ERROR: Initial part of device language version string does "
+                  "not match required format! (returned: \"%s\")\n",
+                  (char *)buffer);
         return -1;
     }
 
     log_info("Returned version \"%s\".\n", buffer);
 
-    char *p1 = (char *)buffer + strlen( "OpenCL C " );
-    while( *p1 == ' ' )
-        p1++;
+    char *p1 = (char *)buffer + strlen("OpenCL C ");
+    while (*p1 == ' ') p1++;
     char *p2 = p1;
-    if( ! isdigit(*p2) )
+    if (!isdigit(*p2))
     {
-        log_error( "ERROR: Major revision number must follow space behind OpenCL C! (returned %s)\n", (char*) buffer );
+        log_error("ERROR: Major revision number must follow space behind "
+                  "OpenCL C! (returned %s)\n",
+                  (char *)buffer);
         return -1;
     }
-    while( isdigit( *p2 ) )
-        p2++;
-    if( *p2 != '.' )
+    while (isdigit(*p2)) p2++;
+    if (*p2 != '.')
     {
-        log_error( "ERROR: Version number must contain a decimal point! (returned: %s)\n", (char *)buffer );
+        log_error("ERROR: Version number must contain a decimal point! "
+                  "(returned: %s)\n",
+                  (char *)buffer);
         return -1;
     }
     char *p3 = p2 + 1;
-    if( ! isdigit(*p3) )
+    if (!isdigit(*p3))
     {
-        log_error( "ERROR: Minor revision number is missing or does not abut the decimal point! (returned %s)\n", (char*) buffer );
+        log_error("ERROR: Minor revision number is missing or does not abut "
+                  "the decimal point! (returned %s)\n",
+                  (char *)buffer);
         return -1;
     }
-    while( isdigit( *p3 ) )
-        p3++;
-    if( *p3 != ' ' )
+    while (isdigit(*p3)) p3++;
+    if (*p3 != ' ')
     {
-        log_error( "ERROR: A space must appear after the minor version! (returned: %s)\n", (char *)buffer );
+        log_error("ERROR: A space must appear after the minor version! "
+                  "(returned: %s)\n",
+                  (char *)buffer);
         return -1;
     }
     *p2 = ' '; // Put in a space for atoi below.
     p2++;
 
-    int major = atoi( p1 );
-    int minor = atoi( p2 );
+    int major = atoi(p1);
+    int minor = atoi(p2);
     int minor_revision = 2;
 
-    if( major * 10 + minor < 10 + minor_revision )
+    if (major * 10 + minor < 10 + minor_revision)
     {
-        // If the language version did not match, check to see if OPENCL_1_0_DEVICE is set.
-        if( getenv("OPENCL_1_0_DEVICE"))
+        // If the language version did not match, check to see if
+        // OPENCL_1_0_DEVICE is set.
+        if (getenv("OPENCL_1_0_DEVICE"))
         {
-          log_info( "WARNING: This test was run with OPENCL_1_0_DEVICE defined!  This is not a OpenCL 1.1 or OpenCL 1.2 compatible device!!!\n" );
+            log_info("WARNING: This test was run with OPENCL_1_0_DEVICE "
+                     "defined!  This is not a OpenCL 1.1 or OpenCL 1.2 "
+                     "compatible device!!!\n");
         }
-        else if( getenv("OPENCL_1_1_DEVICE"))
+        else if (getenv("OPENCL_1_1_DEVICE"))
         {
-          log_info( "WARNING: This test was run with OPENCL_1_1_DEVICE defined!  This is not a OpenCL 1.2 compatible device!!!\n" );
+            log_info(
+                "WARNING: This test was run with OPENCL_1_1_DEVICE defined!  "
+                "This is not a OpenCL 1.2 compatible device!!!\n");
         }
         else
         {
-          log_error( "ERROR: OpenCL device language version returned is less than 1.%d! (Returned: %s)\n", minor_revision, (char *)buffer );
-          return -1;
+            log_error("ERROR: OpenCL device language version returned is less "
+                      "than 1.%d! (Returned: %s)\n",
+                      minor_revision, (char *)buffer);
+            return -1;
         }
     }
 
     // Sanity checks on the returned values
-    if( length != (strlen( (char *)buffer ) + 1 ))
+    if (length != (strlen((char *)buffer) + 1))
     {
-        log_error( "ERROR: Returned length of version string does not match actual length (actual: %d, returned: %d)\n", (int)strlen( (char *)buffer ), (int)length );
+        log_error("ERROR: Returned length of version string does not match "
+                  "actual length (actual: %d, returned: %d)\n",
+                  (int)strlen((char *)buffer), (int)length);
         return -1;
     }
 
     return 0;
 }
-

From 8ffecf27c28d28296180cde282e5665bc2cb2c00 Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Wed, 8 Dec 2021 17:07:42 +0100
Subject: [PATCH 147/158] Fix build, glext should not be used with GLEW (#1337)

* Fix build, glext should not be used with GLEW

* Remove additional define GL_GLEXT_PROTOTYPES

* Remove includes which are already defined in setup.h
---
 test_common/gl/setup_win32.cpp | 3 ---
 test_common/gl/setup_x11.cpp   | 5 -----
 2 files changed, 8 deletions(-)

diff --git a/test_common/gl/setup_win32.cpp b/test_common/gl/setup_win32.cpp
index b120a36d6f..708e681d80 100644
--- a/test_common/gl/setup_win32.cpp
+++ b/test_common/gl/setup_win32.cpp
@@ -13,14 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#define GL_GLEXT_PROTOTYPES
 
 #include "setup.h"
 #include "testBase.h"
 #include "harness/errorHelpers.h"
 
-#include <GL/gl.h>
-#include <GL/glut.h>
 #include <CL/cl_ext.h>
 
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
diff --git a/test_common/gl/setup_x11.cpp b/test_common/gl/setup_x11.cpp
index 7efda3d2ac..abc065c94c 100644
--- a/test_common/gl/setup_x11.cpp
+++ b/test_common/gl/setup_x11.cpp
@@ -13,16 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#define GL_GLEXT_PROTOTYPES
 
 #include "setup.h"
 #include "testBase.h"
 #include "harness/errorHelpers.h"
 
-#include <GL/gl.h>
-#include <GL/glut.h>
-#include <GL/glext.h>
-#include <GL/freeglut.h>
 #include <GL/glx.h>
 #include <CL/cl_ext.h>
 

From 73d71b6a76ce9697c5224a0933157355302d5002 Mon Sep 17 00:00:00 2001
From: Ewan Crawford <ewan@codeplay.com>
Date: Wed, 8 Dec 2021 16:08:15 +0000
Subject: [PATCH 148/158] Add cl_khr_command_buffer to list of extensions
 (#1365)

cl_khr_command_buffer is now public as a provisional khr extension
which implementations may report.
---
 .../compiler/test_compiler_defines_for_extensions.cpp            | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
index 2f29d39b68..1519779a14 100644
--- a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
+++ b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
@@ -78,6 +78,7 @@ const char *known_extensions[] = {
     "cl_khr_semaphore",
     "cl_khr_external_semaphore",
     "cl_khr_external_semaphore_sync_fd",
+    "cl_khr_command_buffer",
 };
 
 size_t num_known_extensions = sizeof(known_extensions) / sizeof(char *);

From 1161d788dd5d71885ca19783210f18c305715a7f Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Tue, 14 Dec 2021 17:52:44 +0000
Subject: [PATCH 149/158] Refactor logging of subgroup test start/pass messages
 (#1361)

Note that this also corrects the start messages logged for the
sub_group_ballot_bit_count/find_msb/find_lsb tests.

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
---
 .../subgroups/subgroup_common_templates.h     | 101 ++++++++++--------
 test_conformance/subgroups/subhelpers.h       |  50 ++++++---
 test_conformance/subgroups/test_barrier.cpp   |  16 ++-
 test_conformance/subgroups/test_ifp.cpp       |   8 +-
 test_conformance/subgroups/test_subgroup.cpp  |   9 +-
 .../subgroups/test_subgroup_ballot.cpp        |  48 ++++++---
 .../test_subgroup_clustered_reduce.cpp        |  14 +--
 .../test_subgroup_non_uniform_vote.cpp        |  16 +--
 8 files changed, 164 insertions(+), 98 deletions(-)

diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h
index 64b4b9718d..fc0b03b5da 100644
--- a/test_conformance/subgroups/subgroup_common_templates.h
+++ b/test_conformance/subgroups/subgroup_common_templates.h
@@ -63,6 +63,13 @@ static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id,
 // only 4 work_items from subgroup enter the code (are active)
 template <typename Ty, SubgroupsBroadcastOp operation> struct BC
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_%s(%s)...%s\n", operation_names(operation),
+                 TypeManager<Ty>::name(), extra_text);
+    }
+
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
     {
         int i, ii, j, k, n;
@@ -76,8 +83,6 @@ template <typename Ty, SubgroupsBroadcastOp operation> struct BC
         int last_subgroup_size = 0;
         ii = 0;
 
-        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
-                 TypeManager<Ty>::name());
         if (non_uniform_size)
         {
             ng++;
@@ -286,8 +291,6 @@ template <typename Ty, SubgroupsBroadcastOp operation> struct BC
             y += nw;
             m += 4 * nw;
         }
-        log_info("  sub_group_%s(%s)... passed\n", operation_names(operation),
-                 TypeManager<Ty>::name());
         return TEST_PASS;
     }
 };
@@ -437,6 +440,13 @@ void genrand(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
 
 template <typename Ty, ShuffleOp operation> struct SHF
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_%s(%s)...%s\n", operation_names(operation),
+                 TypeManager<Ty>::name(), extra_text);
+    }
+
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
     {
         int i, ii, j, k, l, n, delta;
@@ -447,8 +457,6 @@ template <typename Ty, ShuffleOp operation> struct SHF
         int d = ns > 100 ? 100 : ns;
         ii = 0;
         ng = ng / nw;
-        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
-                 TypeManager<Ty>::name());
         for (k = 0; k < ng; ++k)
         { // for each work_group
             for (j = 0; j < nj; ++j)
@@ -560,26 +568,29 @@ template <typename Ty, ShuffleOp operation> struct SHF
             y += nw;
             m += 4 * nw;
         }
-        log_info("  sub_group_%s(%s)... passed\n", operation_names(operation),
-                 TypeManager<Ty>::name());
         return TEST_PASS;
     }
 };
 
 template <typename Ty, ArithmeticOp operation> struct SCEX_NU
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        std::string func_name = (test_params.all_work_item_masks.size() > 0
+                                     ? "sub_group_non_uniform_scan_exclusive"
+                                     : "sub_group_scan_exclusive");
+        log_info("  %s_%s(%s)...%s\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name(),
+                 extra_text);
+    }
+
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
     {
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
         ng = ng / nw;
-        std::string func_name;
-        test_params.work_items_mask.any()
-            ? func_name = "sub_group_non_uniform_scan_exclusive"
-            : func_name = "sub_group_scan_exclusive";
-        log_info("  %s_%s(%s)...\n", func_name.c_str(),
-                 operation_names(operation), TypeManager<Ty>::name());
         genrand<Ty, operation>(x, t, m, ns, nw, ng);
     }
 
@@ -595,11 +606,9 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
         Ty tr, rr;
         ng = ng / nw;
 
-        std::string func_name;
-        test_params.work_items_mask.any()
-            ? func_name = "sub_group_non_uniform_scan_exclusive"
-            : func_name = "sub_group_scan_exclusive";
-
+        std::string func_name = (test_params.all_work_item_masks.size() > 0
+                                     ? "sub_group_non_uniform_scan_exclusive"
+                                     : "sub_group_scan_exclusive");
 
         // for uniform case take into consideration all workitems
         if (!work_items_mask.any())
@@ -656,8 +665,6 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
             m += 4 * nw;
         }
 
-        log_info("  %s_%s(%s)... passed\n", func_name.c_str(),
-                 operation_names(operation), TypeManager<Ty>::name());
         return TEST_PASS;
     }
 };
@@ -665,20 +672,24 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
 // Test for scan inclusive non uniform functions
 template <typename Ty, ArithmeticOp operation> struct SCIN_NU
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        std::string func_name = (test_params.all_work_item_masks.size() > 0
+                                     ? "sub_group_non_uniform_scan_inclusive"
+                                     : "sub_group_scan_inclusive");
+        log_info("  %s_%s(%s)...%s\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name(),
+                 extra_text);
+    }
+
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
     {
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
         ng = ng / nw;
-        std::string func_name;
-        test_params.work_items_mask.any()
-            ? func_name = "sub_group_non_uniform_scan_inclusive"
-            : func_name = "sub_group_scan_inclusive";
-
         genrand<Ty, operation>(x, t, m, ns, nw, ng);
-        log_info("  %s_%s(%s)...\n", func_name.c_str(),
-                 operation_names(operation), TypeManager<Ty>::name());
     }
 
     static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
@@ -694,10 +705,9 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
         Ty tr, rr;
         ng = ng / nw;
 
-        std::string func_name;
-        work_items_mask.any()
-            ? func_name = "sub_group_non_uniform_scan_inclusive"
-            : func_name = "sub_group_scan_inclusive";
+        std::string func_name = (test_params.all_work_item_masks.size() > 0
+                                     ? "sub_group_non_uniform_scan_inclusive"
+                                     : "sub_group_scan_inclusive");
 
         // for uniform case take into consideration all workitems
         if (!work_items_mask.any())
@@ -771,8 +781,6 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
             m += 4 * nw;
         }
 
-        log_info("  %s_%s(%s)... passed\n", func_name.c_str(),
-                 operation_names(operation), TypeManager<Ty>::name());
         return TEST_PASS;
     }
 };
@@ -780,6 +788,16 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
 // Test for reduce non uniform functions
 template <typename Ty, ArithmeticOp operation> struct RED_NU
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        std::string func_name = (test_params.all_work_item_masks.size() > 0
+                                     ? "sub_group_non_uniform_reduce"
+                                     : "sub_group_reduce");
+        log_info("  %s_%s(%s)...%s\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name(),
+                 extra_text);
+    }
 
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
     {
@@ -787,13 +805,6 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
         ng = ng / nw;
-        std::string func_name;
-
-        test_params.work_items_mask.any()
-            ? func_name = "sub_group_non_uniform_reduce"
-            : func_name = "sub_group_reduce";
-        log_info("  %s_%s(%s)...\n", func_name.c_str(),
-                 operation_names(operation), TypeManager<Ty>::name());
         genrand<Ty, operation>(x, t, m, ns, nw, ng);
     }
 
@@ -809,9 +820,9 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
         ng = ng / nw;
         Ty tr, rr;
 
-        std::string func_name;
-        work_items_mask.any() ? func_name = "sub_group_non_uniform_reduce"
-                              : func_name = "sub_group_reduce";
+        std::string func_name = (test_params.all_work_item_masks.size() > 0
+                                     ? "sub_group_non_uniform_reduce"
+                                     : "sub_group_reduce");
 
         for (k = 0; k < ng; ++k)
         {
@@ -875,8 +886,6 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
             m += 4 * nw;
         }
 
-        log_info("  %s_%s(%s)... passed\n", func_name.c_str(),
-                 operation_names(operation), TypeManager<Ty>::name());
         return TEST_PASS;
     }
 };
diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h
index bd4b6d61e9..30105a574e 100644
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -1380,23 +1380,45 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
                             const char *kname, const char *src,
                             WorkGroupParams test_params)
     {
+        Fns::log_test(test_params, "");
+
         test_status combined_error = TEST_SKIPPED_ITSELF;
         for (auto &mask : test_params.all_work_item_masks)
         {
             test_params.work_items_mask = mask;
-            test_status error = run(device, context, queue, num_elements, kname,
-                                    src, test_params);
+            test_status error = do_run(device, context, queue, num_elements,
+                                       kname, src, test_params);
 
             if (error == TEST_FAIL
                 || (error == TEST_PASS && combined_error != TEST_FAIL))
                 combined_error = error;
         }
+
+        if (combined_error == TEST_PASS)
+        {
+            Fns::log_test(test_params, " passed");
+        }
         return combined_error;
     };
-    static test_status run(cl_device_id device, cl_context context,
-                           cl_command_queue queue, int num_elements,
-                           const char *kname, const char *src,
-                           WorkGroupParams test_params)
+    static int run(cl_device_id device, cl_context context,
+                   cl_command_queue queue, int num_elements, const char *kname,
+                   const char *src, WorkGroupParams test_params)
+    {
+        Fns::log_test(test_params, "");
+
+        int error = do_run(device, context, queue, num_elements, kname, src,
+                           test_params);
+
+        if (error == TEST_PASS)
+        {
+            Fns::log_test(test_params, " passed");
+        }
+        return error;
+    };
+    static test_status do_run(cl_device_id device, cl_context context,
+                              cl_command_queue queue, int num_elements,
+                              const char *kname, const char *src,
+                              WorkGroupParams test_params)
     {
         size_t tmp;
         cl_int error;
@@ -1442,16 +1464,14 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
             log_info("Data type not supported : %s\n", TypeManager<Ty>::name());
             return TEST_SKIPPED_ITSELF;
         }
-        else
+
+        if (strstr(TypeManager<Ty>::name(), "double"))
+        {
+            kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n";
+        }
+        else if (strstr(TypeManager<Ty>::name(), "half"))
         {
-            if (strstr(TypeManager<Ty>::name(), "double"))
-            {
-                kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n";
-            }
-            else if (strstr(TypeManager<Ty>::name(), "half"))
-            {
-                kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp16: enable\n";
-            }
+            kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp16: enable\n";
         }
 
         error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform),
diff --git a/test_conformance/subgroups/test_barrier.cpp b/test_conformance/subgroups/test_barrier.cpp
index b570e92292..d415eefbb0 100644
--- a/test_conformance/subgroups/test_barrier.cpp
+++ b/test_conformance/subgroups/test_barrier.cpp
@@ -59,6 +59,17 @@ static const char *gbar_source =
 // barrier test functions
 template <int Which> struct BAR
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        if (Which == 0)
+            log_info("  sub_group_barrier(CLK_LOCAL_MEM_FENCE)...%s\n",
+                     extra_text);
+        else
+            log_info("  sub_group_barrier(CLK_GLOBAL_MEM_FENCE)...%s\n",
+                     extra_text);
+    }
+
     static void gen(cl_int *x, cl_int *t, cl_int *m,
                     const WorkGroupParams &test_params)
     {
@@ -103,11 +114,6 @@ template <int Which> struct BAR
         ng = ng / nw;
         cl_int tr, rr;
 
-        if (Which == 0)
-            log_info("  sub_group_barrier(CLK_LOCAL_MEM_FENCE)...\n");
-        else
-            log_info("  sub_group_barrier(CLK_GLOBAL_MEM_FENCE)...\n");
-
         for (k = 0; k < ng; ++k)
         {
             // Map to array indexed to array indexed by local ID and sub group
diff --git a/test_conformance/subgroups/test_ifp.cpp b/test_conformance/subgroups/test_ifp.cpp
index f6c5227dfc..f2bd5b9257 100644
--- a/test_conformance/subgroups/test_ifp.cpp
+++ b/test_conformance/subgroups/test_ifp.cpp
@@ -225,6 +225,12 @@ void run_insts(cl_int *x, cl_int *p, int n)
 
 struct IFP
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  independent forward progress...%s\n", extra_text);
+    }
+
     static void gen(cl_int *x, cl_int *t, cl_int *,
                     const WorkGroupParams &test_params)
     {
@@ -258,8 +264,6 @@ struct IFP
         // We need at least 2 sub groups per group for this test
         if (nj == 1) return TEST_SKIPPED_ITSELF;
 
-        log_info("  independent forward progress...\n");
-
         for (k = 0; k < ng; ++k)
         {
             run_insts(x, t, nj);
diff --git a/test_conformance/subgroups/test_subgroup.cpp b/test_conformance/subgroups/test_subgroup.cpp
index eefca5f8a4..aa9b32cbb5 100644
--- a/test_conformance/subgroups/test_subgroup.cpp
+++ b/test_conformance/subgroups/test_subgroup.cpp
@@ -24,6 +24,13 @@ namespace {
 // Any/All test functions
 template <NonUniformVoteOp operation> struct AA
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_%s...%s\n", operation_names(operation),
+                 extra_text);
+    }
+
     static void gen(cl_int *x, cl_int *t, cl_int *m,
                     const WorkGroupParams &test_params)
     {
@@ -35,7 +42,6 @@ template <NonUniformVoteOp operation> struct AA
         int e;
         ng = ng / nw;
         ii = 0;
-        log_info("  sub_group_%s...\n", operation_names(operation));
         for (k = 0; k < ng; ++k)
         {
             for (j = 0; j < nj; ++j)
@@ -124,7 +130,6 @@ template <NonUniformVoteOp operation> struct AA
             y += nw;
             m += 4 * nw;
         }
-        log_info("  sub_group_%s... passed\n", operation_names(operation));
         return TEST_PASS;
     }
 };
diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp
index e742aa3b64..837988ea1f 100644
--- a/test_conformance/subgroups/test_subgroup_ballot.cpp
+++ b/test_conformance/subgroups/test_subgroup_ballot.cpp
@@ -23,6 +23,12 @@ namespace {
 // Test for ballot functions
 template <typename Ty> struct BALLOT
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_ballot...%s\n", extra_text);
+    }
+
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
     {
         // no work here
@@ -30,7 +36,6 @@ template <typename Ty> struct BALLOT
         int lws = test_params.local_workgroup_size;
         int sbs = test_params.subgroup_size;
         int non_uniform_size = gws % lws;
-        log_info("  sub_group_ballot...\n");
     }
 
     static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
@@ -92,7 +97,6 @@ template <typename Ty> struct BALLOT
             y += lws;
             m += 4 * lws;
         }
-        log_info("  sub_group_ballot... passed\n");
         return TEST_PASS;
     }
 };
@@ -100,6 +104,13 @@ template <typename Ty> struct BALLOT
 // Test for bit extract ballot functions
 template <typename Ty, BallotOp operation> struct BALLOT_BIT_EXTRACT
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_ballot_%s(%s)...%s\n", operation_names(operation),
+                 TypeManager<Ty>::name(), extra_text);
+    }
+
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
     {
         int wi_id, sb_id, wg_id, l;
@@ -110,8 +121,6 @@ template <typename Ty, BallotOp operation> struct BALLOT_BIT_EXTRACT
         int wg_number = gws / lws;
         int limit_sbs = sbs > 100 ? 100 : sbs;
         int non_uniform_size = gws % lws;
-        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
-                 TypeManager<Ty>::name());
 
         for (wg_id = 0; wg_id < wg_number; ++wg_id)
         { // for each work_group
@@ -251,21 +260,24 @@ template <typename Ty, BallotOp operation> struct BALLOT_BIT_EXTRACT
             y += lws;
             m += 4 * lws;
         }
-        log_info("  sub_group_%s(%s)... passed\n", operation_names(operation),
-                 TypeManager<Ty>::name());
         return TEST_PASS;
     }
 };
 
 template <typename Ty, BallotOp operation> struct BALLOT_INVERSE
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_inverse_ballot...%s\n", extra_text);
+    }
+
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
     {
         int gws = test_params.global_workgroup_size;
         int lws = test_params.local_workgroup_size;
         int sbs = test_params.subgroup_size;
         int non_uniform_size = gws % lws;
-        log_info("  sub_group_inverse_ballot...\n");
         // no work here
     }
 
@@ -341,7 +353,6 @@ template <typename Ty, BallotOp operation> struct BALLOT_INVERSE
             m += 4 * lws;
         }
 
-        log_info("  sub_group_inverse_ballot... passed\n");
         return TEST_PASS;
     }
 };
@@ -350,6 +361,13 @@ template <typename Ty, BallotOp operation> struct BALLOT_INVERSE
 // Test for bit count/inclusive and exclusive scan/ find lsb msb ballot function
 template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_%s(%s)...%s\n", operation_names(operation),
+                 TypeManager<Ty>::name(), extra_text);
+    }
+
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
     {
         int wi_id, wg_id, sb_id;
@@ -362,8 +380,6 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
         int last_subgroup_size = 0;
         int current_sbs = 0;
 
-        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
-                 TypeManager<Ty>::name());
         if (non_uniform_size)
         {
             wg_number++;
@@ -562,8 +578,6 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
             y += lws;
             m += 4 * lws;
         }
-        log_info("  sub_group_ballot_%s(%s)... passed\n",
-                 operation_names(operation), TypeManager<Ty>::name());
         return TEST_PASS;
     }
 };
@@ -571,6 +585,13 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
 // test mask functions
 template <typename Ty, BallotOp operation> struct SMASK
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  get_sub_group_%s_mask...%s\n", operation_names(operation),
+                 extra_text);
+    }
+
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
     {
         int wi_id, wg_id, l, sb_id;
@@ -579,7 +600,6 @@ template <typename Ty, BallotOp operation> struct SMASK
         int sbs = test_params.subgroup_size;
         int sb_number = (lws + sbs - 1) / sbs;
         int wg_number = gws / lws;
-        log_info("  get_sub_group_%s_mask...\n", operation_names(operation));
         for (wg_id = 0; wg_id < wg_number; ++wg_id)
         { // for each work_group
             for (sb_id = 0; sb_id < sb_number; ++sb_id)
@@ -655,8 +675,6 @@ template <typename Ty, BallotOp operation> struct SMASK
             y += lws;
             m += 4 * lws;
         }
-        log_info("  get_sub_group_%s_mask... passed\n",
-                 operation_names(operation));
         return TEST_PASS;
     }
 };
diff --git a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
index ad9e1ff228..f5872006a9 100644
--- a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
+++ b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
@@ -38,15 +38,20 @@ __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type
 // Test for reduce cluster functions
 template <typename Ty, ArithmeticOp operation> struct RED_CLU
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_clustered_reduce_%s(%s, %d bytes) ...%s\n",
+                 operation_names(operation), TypeManager<Ty>::name(),
+                 sizeof(Ty), extra_text);
+    }
+
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
     {
         int nw = test_params.local_workgroup_size;
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
         ng = ng / nw;
-        log_info("  sub_group_clustered_reduce_%s(%s, %d bytes) ...\n",
-                 operation_names(operation), TypeManager<Ty>::name(),
-                 sizeof(Ty));
         genrand<Ty, operation>(x, t, m, ns, nw, ng);
     }
 
@@ -124,9 +129,6 @@ template <typename Ty, ArithmeticOp operation> struct RED_CLU
             y += nw;
             m += 4 * nw;
         }
-        log_info("  sub_group_clustered_reduce_%s(%s, %d bytes) ... passed\n",
-                 operation_names(operation), TypeManager<Ty>::name(),
-                 sizeof(Ty));
         return TEST_PASS;
     }
 };
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
index b21a9f7eed..3f0985e26b 100644
--- a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
@@ -22,6 +22,15 @@ namespace {
 
 template <typename T, NonUniformVoteOp operation> struct VOTE
 {
+    static void log_test(const WorkGroupParams &test_params,
+                         const char *extra_text)
+    {
+        log_info("  sub_group_%s%s(%s)...%s\n",
+                 (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
+                 operation_names(operation), TypeManager<T>::name(),
+                 extra_text);
+    }
+
     static void gen(T *x, T *t, cl_int *m, const WorkGroupParams &test_params)
     {
         int i, ii, j, k, n;
@@ -34,10 +43,6 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
         int last_subgroup_size = 0;
         ii = 0;
 
-        log_info("  sub_group_%s%s(%s)... \n",
-                 (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
-                 operation_names(operation), TypeManager<T>::name());
-
         if (operation == NonUniformVoteOp::elect) return;
 
         for (k = 0; k < ng; ++k)
@@ -192,9 +197,6 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
             m += 4 * nw;
         }
 
-        log_info("  sub_group_%s%s(%s)... passed\n",
-                 (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
-                 operation_names(operation), TypeManager<T>::name());
         return TEST_PASS;
     }
 };

From c2facedfa0a0e07f7602cfecae90392419c0e159 Mon Sep 17 00:00:00 2001
From: Sreelakshmi Haridas Maruthur <sharidas@quicinc.com>
Date: Wed, 5 Jan 2022 08:43:50 -0700
Subject: [PATCH 150/158] Remove dead threading code (#1339)

Remove threading code that hasn't been used for the last three years
and isn't included in makefiles.

Co-authored-by: oramirez <oramirez@qti.qualcomm.com>
---
 test_common/harness/threadTesting.cpp | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 test_common/harness/threadTesting.cpp

diff --git a/test_common/harness/threadTesting.cpp b/test_common/harness/threadTesting.cpp
deleted file mode 100644
index e69de29bb2..0000000000

From b71c2047943a44a2e99c367e406e680caa160bfe Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Wed, 5 Jan 2022 17:08:52 +0100
Subject: [PATCH 151/158] test_subgroups - Set safe input values for half type
 and mul, add operations (#1346)

* Set safe input values for half type and mul, add operations

* Set safe values for all data types

* Typo fix

* Set constant seed for shuffle

* Change function name to be more specific

* set_value takes an integer value, not a bit pattern
---
 .../subgroups/subgroup_common_templates.h     | 48 +++++++++++++++----
 .../test_subgroup_clustered_reduce.cpp        |  2 +-
 2 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h
index fc0b03b5da..641c187585 100644
--- a/test_conformance/subgroups/subgroup_common_templates.h
+++ b/test_conformance/subgroups/subgroup_common_templates.h
@@ -20,6 +20,8 @@
 #include "CL/cl_half.h"
 #include "subhelpers.h"
 #include <set>
+#include <algorithm>
+#include <random>
 
 static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id,
                                   const std::string &mask_type,
@@ -391,11 +393,44 @@ template <typename Ty> bool is_floating_point()
         || std::is_same<Ty, subgroups::cl_half>::value;
 }
 
+// limit possible input values to avoid arithmetic rounding/overflow issues.
+// for each subgroup values defined different values
+// for rest of workitems set 1
+// shuffle values
+static void fill_and_shuffle_safe_values(std::vector<cl_ulong> &safe_values,
+                                         int sb_size)
+{
+    // max product is 720, cl_half has enough precision for it
+    const std::vector<cl_ulong> non_one_values{ 2, 3, 4, 5, 6 };
+
+    if (sb_size <= non_one_values.size())
+    {
+        safe_values.assign(non_one_values.begin(),
+                           non_one_values.begin() + sb_size);
+    }
+    else
+    {
+        safe_values.assign(sb_size, 1);
+        std::copy(non_one_values.begin(), non_one_values.end(),
+                  safe_values.begin());
+    }
+
+    std::mt19937 mersenne_twister_engine(10000);
+    std::shuffle(safe_values.begin(), safe_values.end(),
+                 mersenne_twister_engine);
+};
+
 template <typename Ty, ArithmeticOp operation>
-void genrand(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
+void generate_inputs(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
 {
     int nj = (nw + ns - 1) / ns;
 
+    std::vector<cl_ulong> safe_values;
+    if (operation == ArithmeticOp::mul_ || operation == ArithmeticOp::add_)
+    {
+        fill_and_shuffle_safe_values(safe_values, ns);
+    }
+
     for (int k = 0; k < ng; ++k)
     {
         for (int j = 0; j < nj; ++j)
@@ -406,13 +441,10 @@ void genrand(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
             for (int i = 0; i < n; ++i)
             {
                 cl_ulong out_value;
-                double y;
                 if (operation == ArithmeticOp::mul_
                     || operation == ArithmeticOp::add_)
                 {
-                    // work around to avoid overflow, do not use 0 for
-                    // multiplication
-                    out_value = (genrand_int32(gMTdata) % 4) + 1;
+                    out_value = safe_values[i];
                 }
                 else
                 {
@@ -591,7 +623,7 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
         ng = ng / nw;
-        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+        generate_inputs<Ty, operation>(x, t, m, ns, nw, ng);
     }
 
     static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
@@ -689,7 +721,7 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
         ng = ng / nw;
-        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+        generate_inputs<Ty, operation>(x, t, m, ns, nw, ng);
     }
 
     static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
@@ -805,7 +837,7 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
         ng = ng / nw;
-        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+        generate_inputs<Ty, operation>(x, t, m, ns, nw, ng);
     }
 
     static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
diff --git a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
index f5872006a9..527be5ad5c 100644
--- a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
+++ b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
@@ -52,7 +52,7 @@ template <typename Ty, ArithmeticOp operation> struct RED_CLU
         int ns = test_params.subgroup_size;
         int ng = test_params.global_workgroup_size;
         ng = ng / nw;
-        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+        generate_inputs<Ty, operation>(x, t, m, ns, nw, ng);
     }
 
     static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,

From f91daf3d062d7d085bd9e9154869d2179655685f Mon Sep 17 00:00:00 2001
From: Jim Lewis <j.lewis1@samsung.com>
Date: Thu, 6 Jan 2022 04:23:07 -0600
Subject: [PATCH 152/158] Remove invalid negative_get_platform_info testcase
 (#1374)

* Remove invalid negative_get_platform_info testcase

* Implementations are only required to do null checks
* Fixes #1318

* Fix formatting
---
 test_conformance/api/negative_platform.cpp | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/test_conformance/api/negative_platform.cpp b/test_conformance/api/negative_platform.cpp
index 7d9de5df87..861d47484e 100644
--- a/test_conformance/api/negative_platform.cpp
+++ b/test_conformance/api/negative_platform.cpp
@@ -42,18 +42,9 @@ int test_negative_get_platform_info(cl_device_id deviceID, cl_context context,
 {
     cl_platform_id platform = getPlatformFromDevice(deviceID);
 
-    cl_int err =
-        clGetPlatformInfo(reinterpret_cast<cl_platform_id>(deviceID),
-                          CL_PLATFORM_VERSION, sizeof(char*), nullptr, nullptr);
-    test_failure_error_ret(
-        err, CL_INVALID_PLATFORM,
-        "clGetPlatformInfo should return CL_INVALID_PLATFORM  when: \"platform "
-        "is not a valid platform\" using a valid object which is NOT a "
-        "platform",
-        TEST_FAIL);
-
     constexpr cl_platform_info INVALID_PARAM_VALUE = 0;
-    err = clGetPlatformInfo(platform, INVALID_PARAM_VALUE, 0, nullptr, nullptr);
+    cl_int err =
+        clGetPlatformInfo(platform, INVALID_PARAM_VALUE, 0, nullptr, nullptr);
     test_failure_error_ret(
         err, CL_INVALID_VALUE,
         "clGetPlatformInfo should return CL_INVALID_VALUE when: \"param_name "

From 51c6d97d2f9d62e5bdcbc1f4cbec2d5be2bedf0a Mon Sep 17 00:00:00 2001
From: Jim Lewis <j.lewis1@samsung.com>
Date: Thu, 6 Jan 2022 04:26:20 -0600
Subject: [PATCH 153/158] Fix test_api get_command_queue_info (#1324)

* Fix test_api get_command_queue_info

Decouple host and device out-of-order test enabling

* Rename property sets more generically

* Refactor to use std::vector to accumulate test permutations
---
 test_conformance/api/test_queries.cpp | 127 ++++++++++++++------------
 1 file changed, 70 insertions(+), 57 deletions(-)

diff --git a/test_conformance/api/test_queries.cpp b/test_conformance/api/test_queries.cpp
index 469a19349a..30b5706f6b 100644
--- a/test_conformance/api/test_queries.cpp
+++ b/test_conformance/api/test_queries.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -19,6 +19,7 @@
 #include <stdlib.h>
 #include <ctype.h>
 #include <algorithm>
+#include <vector>
 
 int test_get_platform_info(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
@@ -345,87 +346,100 @@ int command_queue_param_test(cl_command_queue queue,
     return 0;
 }
 
-#define MIN_NUM_COMMAND_QUEUE_PROPERTIES 2
-#define OOO_NUM_COMMAND_QUEUE_PROPERTIES 4
-static cl_command_queue_properties property_options[] = {
-    0,
-
-    CL_QUEUE_PROFILING_ENABLE,
-
-    CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
-
-    CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
-
-    CL_QUEUE_ON_DEVICE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
-
-    CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_ON_DEVICE
-        | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
-
-    CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT
-        | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
-
-    CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT
-        | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
-};
-
 int check_get_command_queue_info_params(cl_device_id deviceID,
                                         cl_context context,
                                         bool is_compatibility)
 {
-    int error;
-    size_t size;
+    const cl_command_queue_properties host_optional[] = {
+        CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
+        CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
+    };
+
+    const cl_command_queue_properties device_required[] = {
+        CL_QUEUE_ON_DEVICE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
+        CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_ON_DEVICE
+            | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
+        CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT
+            | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
+        CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_ON_DEVICE
+            | CL_QUEUE_ON_DEVICE_DEFAULT
+            | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
+    };
+
+    const size_t host_optional_size = ARRAY_SIZE(host_optional);
+    const size_t device_required_size = ARRAY_SIZE(device_required);
+
+    Version version = get_device_cl_version(deviceID);
 
-    cl_queue_properties host_queue_props, device_queue_props;
-    cl_queue_properties queue_props[] = { CL_QUEUE_PROPERTIES, 0, 0 };
+    const cl_device_info host_queue_query = version >= Version(2, 0)
+        ? CL_DEVICE_QUEUE_ON_HOST_PROPERTIES
+        : CL_DEVICE_QUEUE_PROPERTIES;
 
-    clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES,
-                    sizeof(host_queue_props), &host_queue_props, NULL);
-    log_info("CL_DEVICE_QUEUE_ON_HOST_PROPERTIES is %d\n",
-             (int)host_queue_props);
-    clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES,
-                    sizeof(device_queue_props), &device_queue_props, NULL);
-    log_info("CL_DEVICE_QUEUE_ON_HOST_PROPERTIES is %d\n",
-             (int)device_queue_props);
+    cl_queue_properties host_queue_props = 0;
+    int error =
+        clGetDeviceInfo(deviceID, host_queue_query, sizeof(host_queue_props),
+                        &host_queue_props, NULL);
+    test_error(error, "clGetDeviceInfo failed");
+    log_info("CL_DEVICE_QUEUE_ON_HOST_PROPERTIES is %d\n", host_queue_props);
 
-    auto version = get_device_cl_version(deviceID);
+    cl_queue_properties device_queue_props = 0;
+    if (version >= Version(2, 0))
+    {
+        error = clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES,
+                                sizeof(device_queue_props), &device_queue_props,
+                                NULL);
+        test_error(error, "clGetDeviceInfo failed");
+        log_info("CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES is %d\n",
+                 device_queue_props);
+    }
+
+    bool out_of_order_supported =
+        host_queue_props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
 
-    // Are on device queues supported
     bool on_device_supported =
         (version >= Version(2, 0) && version < Version(3, 0))
         || (version >= Version(3, 0) && device_queue_props != 0);
 
-    int num_test_options = MIN_NUM_COMMAND_QUEUE_PROPERTIES;
-    if (host_queue_props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    // test device queues if the device and the API under test support it
+    bool test_on_device = on_device_supported && !is_compatibility;
+
+    std::vector<cl_queue_properties> queue_props{ 0,
+                                                  CL_QUEUE_PROFILING_ENABLE };
+
+    if (out_of_order_supported)
     {
-        // Test out-of-order queues properties if supported
-        num_test_options = OOO_NUM_COMMAND_QUEUE_PROPERTIES;
-    }
-    if (on_device_supported && !is_compatibility)
+        queue_props.insert(queue_props.end(), &host_optional[0],
+                           &host_optional[host_optional_size]);
+    };
+
+    cl_queue_properties queue_props_arg[] = { CL_QUEUE_PROPERTIES, 0, 0 };
+
+    if (test_on_device)
     {
-        // Test queue on device if supported (in this case out-of-order must
-        // also be supported)
-        num_test_options = ARRAY_SIZE(property_options);
-    }
+        queue_props.insert(queue_props.end(), &device_required[0],
+                           &device_required[device_required_size]);
+    };
 
-    for (int i = 0; i < num_test_options; i++)
+    for (cl_queue_properties props : queue_props)
     {
-        queue_props[1] = property_options[i];
-        clCommandQueueWrapper queue;
 
+        queue_props_arg[1] = props;
+
+        clCommandQueueWrapper queue;
         if (is_compatibility)
         {
-            queue =
-                clCreateCommandQueue(context, deviceID, queue_props[1], &error);
+            queue = clCreateCommandQueue(context, deviceID, props, &error);
             test_error(error, "Unable to create command queue to test with");
         }
         else
         {
             queue = clCreateCommandQueueWithProperties(context, deviceID,
-                                                       &queue_props[0], &error);
+                                                       queue_props_arg, &error);
             test_error(error, "Unable to create command queue to test with");
         }
 
         cl_uint refCount;
+        size_t size;
         error = clGetCommandQueueInfo(queue, CL_QUEUE_REFERENCE_COUNT,
                                       sizeof(refCount), &refCount, &size);
         test_error(error, "Unable to get command queue reference count");
@@ -442,11 +456,12 @@ int check_get_command_queue_info_params(cl_device_id deviceID,
         test_error(error, "param checking failed");
 
         error = command_queue_param_test(queue, CL_QUEUE_PROPERTIES,
-                                         queue_props[1], "properties");
+                                         queue_props_arg[1], "properties");
         test_error(error, "param checking failed");
     }
     return 0;
 }
+
 int test_get_command_queue_info(cl_device_id deviceID, cl_context context,
                                 cl_command_queue ignoreQueue, int num_elements)
 {
@@ -824,5 +839,3 @@ int test_kernel_required_group_size(cl_device_id deviceID, cl_context context, c
 
     return 0;
 }
-
-

From 06415f8b79c38bb08279c8267d38b41101f32760 Mon Sep 17 00:00:00 2001
From: Sreelakshmi Haridas Maruthur <sharidas@quicinc.com>
Date: Tue, 11 Jan 2022 09:52:11 -0700
Subject: [PATCH 154/158] Fix memory leaks (#1378)

* Fix memory leaks

Fixed memory leaks in: buffers, basic, and vectors

* Formatting fixes

Co-authored-by: oramirez <oramirez@qti.qualcomm.com>
---
 .../basic/test_vector_swizzle.cpp             | 58 +++++++++++--------
 test_conformance/buffers/test_buffer_fill.cpp |  4 +-
 test_conformance/buffers/test_buffer_read.cpp |  4 +-
 test_conformance/vectors/test_step.cpp        |  2 +
 4 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/test_conformance/basic/test_vector_swizzle.cpp b/test_conformance/basic/test_vector_swizzle.cpp
index 5ab3ea4fd2..884bcf36ff 100644
--- a/test_conformance/basic/test_vector_swizzle.cpp
+++ b/test_conformance/basic/test_vector_swizzle.cpp
@@ -610,9 +610,6 @@ static int test_vectype(const char* type_name, cl_device_id device,
     cl_int error = CL_SUCCESS;
     int result = TEST_PASS;
 
-    clProgramWrapper program;
-    clKernelWrapper kernel;
-
     std::string buildOptions{ "-DTYPE=" };
     buildOptions += type_name;
     buildOptions += std::to_string(N);
@@ -628,35 +625,50 @@ static int test_vectype(const char* type_name, cl_device_id device,
     makeReference<T, N, S>(reference);
 
     // XYZW swizzles:
+    {
+        clProgramWrapper program;
+        clKernelWrapper kernel;
 
-    const char* xyzw_source = TestInfo<N>::kernel_source_xyzw;
-    error = create_single_kernel_helper(
-        context, &program, &kernel, 1, &xyzw_source, "test_vector_swizzle_xyzw",
-        buildOptions.c_str());
-    test_error(error, "Unable to create xyzw test kernel");
+        const char* xyzw_source = TestInfo<N>::kernel_source_xyzw;
+        error = create_single_kernel_helper(
+            context, &program, &kernel, 1, &xyzw_source,
+            "test_vector_swizzle_xyzw", buildOptions.c_str());
+        test_error(error, "Unable to create xyzw test kernel");
 
-    result |= test_vectype_case(value, reference, context, kernel, queue);
+        result |= test_vectype_case(value, reference, context, kernel, queue);
+    }
 
     // sN swizzles:
-    const char* sN_source = TestInfo<N>::kernel_source_sN;
-    error = create_single_kernel_helper(context, &program, &kernel, 1,
-                                        &sN_source, "test_vector_swizzle_sN",
-                                        buildOptions.c_str());
-    test_error(error, "Unable to create sN test kernel");
+    {
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+
+        const char* sN_source = TestInfo<N>::kernel_source_sN;
+        error = create_single_kernel_helper(
+            context, &program, &kernel, 1, &sN_source, "test_vector_swizzle_sN",
+            buildOptions.c_str());
+        test_error(error, "Unable to create sN test kernel");
 
-    result |= test_vectype_case(value, reference, context, kernel, queue);
+        result |= test_vectype_case(value, reference, context, kernel, queue);
+    }
 
     // RGBA swizzles for OpenCL 3.0 and newer:
-    const Version device_version = get_device_cl_version(device);
-    if (device_version >= Version(3, 0))
     {
-        const char* rgba_source = TestInfo<N>::kernel_source_rgba;
-        error = create_single_kernel_helper(
-            context, &program, &kernel, 1, &rgba_source,
-            "test_vector_swizzle_rgba", buildOptions.c_str());
-        test_error(error, "Unable to create rgba test kernel");
+        clProgramWrapper program;
+        clKernelWrapper kernel;
 
-        result |= test_vectype_case(value, reference, context, kernel, queue);
+        const Version device_version = get_device_cl_version(device);
+        if (device_version >= Version(3, 0))
+        {
+            const char* rgba_source = TestInfo<N>::kernel_source_rgba;
+            error = create_single_kernel_helper(
+                context, &program, &kernel, 1, &rgba_source,
+                "test_vector_swizzle_rgba", buildOptions.c_str());
+            test_error(error, "Unable to create rgba test kernel");
+
+            result |=
+                test_vectype_case(value, reference, context, kernel, queue);
+        }
     }
 
     return result;
diff --git a/test_conformance/buffers/test_buffer_fill.cpp b/test_conformance/buffers/test_buffer_fill.cpp
index 9c9c7d17ef..92079794fd 100644
--- a/test_conformance/buffers/test_buffer_fill.cpp
+++ b/test_conformance/buffers/test_buffer_fill.cpp
@@ -703,8 +703,6 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu
 int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
 {
     TestStruct pattern;
-    clProgramWrapper program;
-    clKernelWrapper kernel;
     size_t      ptrSize = sizeof( TestStruct );
     size_t      global_work_size[3];
     int         n, err;
@@ -720,6 +718,8 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma
 
     for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
     {
+        clProgramWrapper program;
+        clKernelWrapper kernel;
         log_info("Testing with cl_mem_flags: %s\n",
                  flag_set_names[src_flag_id]);
 
diff --git a/test_conformance/buffers/test_buffer_read.cpp b/test_conformance/buffers/test_buffer_read.cpp
index 39cf3297e0..49a57f9281 100644
--- a/test_conformance/buffers/test_buffer_read.cpp
+++ b/test_conformance/buffers/test_buffer_read.cpp
@@ -763,7 +763,6 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman
 {
     clProgramWrapper program[5];
     clKernelWrapper kernel[5];
-    clEventWrapper event;
     void        *outptr[5];
     void        *inptr[5];
     size_t      global_work_size[3];
@@ -805,6 +804,7 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman
         for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
         {
             clMemWrapper buffer;
+            clEventWrapper event;
             outptr[i] = align_malloc(ptrSizes[i] * num_elements, min_alignment);
             if ( ! outptr[i] ){
                 log_error( " unable to allocate %d bytes for outptr\n", (int)(ptrSizes[i] * num_elements) );
@@ -900,7 +900,6 @@ int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, c
 {
     clProgramWrapper program[5];
     clKernelWrapper kernel[5];
-    clEventWrapper event;
     void        *outptr[5], *inptr[5];
     size_t      global_work_size[3];
     cl_int      err;
@@ -941,6 +940,7 @@ int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, c
         for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++)
         {
             clMemWrapper buffer;
+            clEventWrapper event;
             outptr[i] = align_malloc(ptrSizes[i] * num_elements, min_alignment);
             if ( ! outptr[i] ){
                 log_error( " unable to allocate %d bytes for outptr\n", (int)(ptrSizes[i] * num_elements) );
diff --git a/test_conformance/vectors/test_step.cpp b/test_conformance/vectors/test_step.cpp
index 2f6ad187a0..089bad2f3a 100644
--- a/test_conformance/vectors/test_step.cpp
+++ b/test_conformance/vectors/test_step.cpp
@@ -172,6 +172,8 @@ int test_step_internal(cl_device_id deviceID, cl_context context,
                 destroyClState(pClState);
                 return -1;
             }
+
+            clStateDestroyProgramAndKernel(pClState);
         }
     }
 

From 656886030b294225b92379ef14306b2e5b9a3f04 Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Wed, 19 Jan 2022 14:17:54 +0000
Subject: [PATCH 155/158] Refactor divergence mask handling in subgroup tests
 (#1379)

This changes compilation of subgroup test kernels so that a separate
compilation is no longer performed for each divergence mask value.

The divergence mask is now passed as a kernel argument.

This also fixes all subgroup_functions_non_uniform_arithmetic testing
and the sub_group_elect and sub_group_any/all_equal subtests of the
subgroup_functions_non_uniform_vote test to use the correct order of
vector components for GPUs with a subgroup size greater than 64.

The conversion of divergence mask bitsets to uint4 vectors has been
corrected to match code comments in WorkGroupParams::load_masks()
in test_conformance/subgroups/subhelpers.h.

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
---
 test_conformance/subgroups/subhelpers.h       | 172 +++++++++---------
 .../test_subgroup_non_uniform_arithmetic.cpp  |   8 +-
 .../test_subgroup_non_uniform_vote.cpp        |  14 +-
 3 files changed, 96 insertions(+), 98 deletions(-)

diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h
index 30105a574e..aa4abc967e 100644
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -34,12 +34,24 @@ extern MTdata gMTdata;
 typedef std::bitset<128> bs128;
 extern cl_half_rounding_mode g_rounding_mode;
 
+static cl_uint4 bs128_to_cl_uint4(bs128 v)
+{
+    bs128 bs128_ffffffff = 0xffffffffU;
+
+    cl_uint4 r;
+    r.s0 = ((v >> 0) & bs128_ffffffff).to_ulong();
+    r.s1 = ((v >> 32) & bs128_ffffffff).to_ulong();
+    r.s2 = ((v >> 64) & bs128_ffffffff).to_ulong();
+    r.s3 = ((v >> 96) & bs128_ffffffff).to_ulong();
+
+    return r;
+}
+
 struct WorkGroupParams
 {
-    WorkGroupParams(size_t gws, size_t lws,
-                    bool use_mask = false)
+    WorkGroupParams(size_t gws, size_t lws, int dm_arg = -1)
         : global_workgroup_size(gws), local_workgroup_size(lws),
-          use_masks(use_mask)
+          divergence_mask_arg(dm_arg)
     {
         subgroup_size = 0;
         work_items_mask = 0;
@@ -54,7 +66,7 @@ struct WorkGroupParams
     int dynsc;
     bool use_core_subgroups;
     std::vector<bs128> all_work_item_masks;
-    bool use_masks;
+    int divergence_mask_arg;
     void save_kernel_source(const std::string &source, std::string name = "")
     {
         if (name == "")
@@ -84,7 +96,7 @@ struct WorkGroupParams
     std::map<std::string, std::string> kernel_function_name;
     void load_masks()
     {
-        if (use_masks)
+        if (divergence_mask_arg != -1)
         {
             // 1 in string will be set 1, 0 will be set 0
             bs128 mask_0xf0f0f0f0("11110000111100001111000011110000"
@@ -1375,50 +1387,10 @@ static int run_kernel(cl_context context, cl_command_queue queue,
 // Driver for testing a single built in function
 template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
 {
-    static test_status mrun(cl_device_id device, cl_context context,
-                            cl_command_queue queue, int num_elements,
-                            const char *kname, const char *src,
-                            WorkGroupParams test_params)
-    {
-        Fns::log_test(test_params, "");
-
-        test_status combined_error = TEST_SKIPPED_ITSELF;
-        for (auto &mask : test_params.all_work_item_masks)
-        {
-            test_params.work_items_mask = mask;
-            test_status error = do_run(device, context, queue, num_elements,
-                                       kname, src, test_params);
-
-            if (error == TEST_FAIL
-                || (error == TEST_PASS && combined_error != TEST_FAIL))
-                combined_error = error;
-        }
-
-        if (combined_error == TEST_PASS)
-        {
-            Fns::log_test(test_params, " passed");
-        }
-        return combined_error;
-    };
-    static int run(cl_device_id device, cl_context context,
-                   cl_command_queue queue, int num_elements, const char *kname,
-                   const char *src, WorkGroupParams test_params)
-    {
-        Fns::log_test(test_params, "");
-
-        int error = do_run(device, context, queue, num_elements, kname, src,
-                           test_params);
-
-        if (error == TEST_PASS)
-        {
-            Fns::log_test(test_params, " passed");
-        }
-        return error;
-    };
-    static test_status do_run(cl_device_id device, cl_context context,
-                              cl_command_queue queue, int num_elements,
-                              const char *kname, const char *src,
-                              WorkGroupParams test_params)
+    static test_status run(cl_device_id device, cl_context context,
+                           cl_command_queue queue, int num_elements,
+                           const char *kname, const char *src,
+                           WorkGroupParams test_params)
     {
         size_t tmp;
         cl_int error;
@@ -1436,25 +1408,8 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
         std::vector<Ty> mapout;
         mapout.resize(local);
         std::stringstream kernel_sstr;
-        if (test_params.use_masks)
-        {
-            // Prapare uint4 type to store bitmask on kernel OpenCL C side
-            // To keep order the first characet in string is the lowest bit
-            // there was a need to give such offset to bitset constructor
-            // (first highest offset = 96)
-            std::bitset<32> bits_1_32(test_params.work_items_mask.to_string(),
-                                      96, 32);
-            std::bitset<32> bits_33_64(test_params.work_items_mask.to_string(),
-                                       64, 32);
-            std::bitset<32> bits_65_96(test_params.work_items_mask.to_string(),
-                                       32, 32);
-            std::bitset<32> bits_97_128(test_params.work_items_mask.to_string(),
-                                        0, 32);
-            kernel_sstr << "global uint4 work_item_mask_vector = (uint4)(0b"
-                        << bits_1_32 << ",0b" << bits_33_64 << ",0b"
-                        << bits_65_96 << ",0b" << bits_97_128 << ");\n";
-        }
 
+        Fns::log_test(test_params, "");
 
         kernel_sstr << "#define NR_OF_ACTIVE_WORK_ITEMS ";
         kernel_sstr << NR_OF_ACTIVE_WORK_ITEMS << "\n";
@@ -1563,6 +1518,18 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
         idata.resize(input_array_size);
         odata.resize(output_array_size);
 
+        if (test_params.divergence_mask_arg != -1)
+        {
+            cl_uint4 mask_vector;
+            mask_vector.x = 0xffffffffU;
+            mask_vector.y = 0xffffffffU;
+            mask_vector.z = 0xffffffffU;
+            mask_vector.w = 0xffffffffU;
+            error = clSetKernelArg(kernel, test_params.divergence_mask_arg,
+                                   sizeof(cl_uint4), &mask_vector);
+            test_error_fail(error, "Unable to set divergence mask argument");
+        }
+
         // Run the kernel once on zeroes to get the map
         memset(idata.data(), 0, input_array_size * sizeof(Ty));
         error = run_kernel(context, queue, kernel, global, local, idata.data(),
@@ -1572,25 +1539,65 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
         test_error_fail(error, "Running kernel first time failed");
 
         // Generate the desired input for the kernel
-
         test_params.subgroup_size = subgroup_size;
         Fns::gen(idata.data(), mapin.data(), sgmap.data(), test_params);
-        error = run_kernel(context, queue, kernel, global, local, idata.data(),
+
+        test_status combined_status;
+
+        if (test_params.divergence_mask_arg != -1)
+        {
+            combined_status = TEST_SKIPPED_ITSELF;
+
+            for (auto &mask : test_params.all_work_item_masks)
+            {
+                test_params.work_items_mask = mask;
+                cl_uint4 mask_vector = bs128_to_cl_uint4(mask);
+                clSetKernelArg(kernel, test_params.divergence_mask_arg,
+                               sizeof(cl_uint4), &mask_vector);
+                error = run_kernel(context, queue, kernel, global, local,
+                                   idata.data(), input_array_size * sizeof(Ty),
+                                   sgmap.data(), global * sizeof(cl_int4),
+                                   odata.data(), output_array_size * sizeof(Ty),
+                                   TSIZE * sizeof(Ty));
+                test_error_fail(error, "Running kernel second time failed");
+
+                // Check the result
+                test_status status =
+                    Fns::chk(idata.data(), odata.data(), mapin.data(),
+                             mapout.data(), sgmap.data(), test_params);
+
+                if (status == TEST_FAIL
+                    || (status == TEST_PASS && combined_status != TEST_FAIL))
+                    combined_status = status;
+
+                if (status == TEST_FAIL) break;
+            }
+        }
+        else
+        {
+            error =
+                run_kernel(context, queue, kernel, global, local, idata.data(),
                            input_array_size * sizeof(Ty), sgmap.data(),
                            global * sizeof(cl_int4), odata.data(),
                            output_array_size * sizeof(Ty), TSIZE * sizeof(Ty));
-        test_error_fail(error, "Running kernel second time failed");
+            test_error_fail(error, "Running kernel second time failed");
 
-        // Check the result
-        test_status status = Fns::chk(idata.data(), odata.data(), mapin.data(),
-                                      mapout.data(), sgmap.data(), test_params);
+            // Check the result
+            combined_status =
+                Fns::chk(idata.data(), odata.data(), mapin.data(),
+                         mapout.data(), sgmap.data(), test_params);
+        }
         // Detailed failure and skip messages should be logged by Fns::gen
         // and Fns::chk.
-        if (status == TEST_FAIL)
+        if (combined_status == TEST_PASS)
+        {
+            Fns::log_test(test_params, " passed");
+        }
+        else if (combined_status == TEST_FAIL)
         {
             test_fail("Data verification failed\n");
         }
-        return status;
+        return combined_status;
     }
 };
 
@@ -1643,18 +1650,9 @@ struct RunTestForType
             std::regex_replace(test_params_.get_kernel_source(function_name),
                                std::regex("\\%s"), function_name);
         std::string kernel_name = "test_" + function_name;
-        if (test_params_.all_work_item_masks.size() > 0)
-        {
-            error = test<T, U>::mrun(device_, context_, queue_, num_elements_,
-                                     kernel_name.c_str(), source.c_str(),
-                                     test_params_);
-        }
-        else
-        {
-            error = test<T, U>::run(device_, context_, queue_, num_elements_,
-                                    kernel_name.c_str(), source.c_str(),
-                                    test_params_);
-        }
+        error =
+            test<T, U>::run(device_, context_, queue_, num_elements_,
+                            kernel_name.c_str(), source.c_str(), test_params_);
 
         // If we return TEST_SKIPPED_ITSELF here, then an entire suite may be
         // reported as having been skipped even if some tests within it
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
index 5ab4522268..02fc507b55 100644
--- a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
@@ -21,7 +21,7 @@
 namespace {
 
 std::string sub_group_non_uniform_arithmetic_source = R"(
-    __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
+    __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) {
         int gid = get_global_id(0);
         XY(xy,gid);
         uint subgroup_local_id = get_sub_group_local_id();
@@ -32,9 +32,9 @@ std::string sub_group_non_uniform_arithmetic_source = R"(
         } else if(subgroup_local_id < 64) {
             work_item_mask = work_item_mask_vector.y;
         } else if(subgroup_local_id < 96) {
-            work_item_mask = work_item_mask_vector.w;
-        } else if(subgroup_local_id < 128) {
             work_item_mask = work_item_mask_vector.z;
+        } else if(subgroup_local_id < 128) {
+            work_item_mask = work_item_mask_vector.w;
         }
         if (elect_work_item & work_item_mask){
             out[gid] = %s(in[gid]);
@@ -136,7 +136,7 @@ int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device,
 
     constexpr size_t global_work_size = 2000;
     constexpr size_t local_work_size = 200;
-    WorkGroupParams test_params(global_work_size, local_work_size, true);
+    WorkGroupParams test_params(global_work_size, local_work_size, 3);
     test_params.save_kernel_source(sub_group_non_uniform_arithmetic_source);
     RunTestForType rft(device, context, queue, num_elements, test_params);
 
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
index 3f0985e26b..3be1ba307a 100644
--- a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
@@ -202,7 +202,7 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
 };
 
 std::string sub_group_elect_source = R"(
-    __kernel void test_sub_group_elect(const __global Type *in, __global int4 *xy, __global Type *out) {
+    __kernel void test_sub_group_elect(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) {
         int gid = get_global_id(0);
         XY(xy,gid);
         uint subgroup_local_id = get_sub_group_local_id();
@@ -213,9 +213,9 @@ std::string sub_group_elect_source = R"(
         } else if(subgroup_local_id < 64) {
             work_item_mask = work_item_mask_vector.y;
         } else if(subgroup_local_id < 96) {
-            work_item_mask = work_item_mask_vector.w;
-        } else if(subgroup_local_id < 128) {
             work_item_mask = work_item_mask_vector.z;
+        } else if(subgroup_local_id < 128) {
+            work_item_mask = work_item_mask_vector.w;
         }
         if (elect_work_item & work_item_mask){
             out[gid] = sub_group_elect();
@@ -224,7 +224,7 @@ std::string sub_group_elect_source = R"(
 )";
 
 std::string sub_group_non_uniform_any_all_all_equal_source = R"(
-    __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
+    __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) {
         int gid = get_global_id(0);
         XY(xy,gid);
         uint subgroup_local_id = get_sub_group_local_id();
@@ -235,9 +235,9 @@ std::string sub_group_non_uniform_any_all_all_equal_source = R"(
         } else if(subgroup_local_id < 64) {
             work_item_mask = work_item_mask_vector.y;
         } else if(subgroup_local_id < 96) {
-            work_item_mask = work_item_mask_vector.w;
-        } else if(subgroup_local_id < 128) {
             work_item_mask = work_item_mask_vector.z;
+        } else if(subgroup_local_id < 128) {
+            work_item_mask = work_item_mask_vector.w;
         }
         if (elect_work_item & work_item_mask){
                 out[gid] = %s(in[gid]);
@@ -267,7 +267,7 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device,
 
     constexpr size_t global_work_size = 170;
     constexpr size_t local_work_size = 64;
-    WorkGroupParams test_params(global_work_size, local_work_size, true);
+    WorkGroupParams test_params(global_work_size, local_work_size, 3);
     test_params.save_kernel_source(
         sub_group_non_uniform_any_all_all_equal_source);
     test_params.save_kernel_source(sub_group_elect_source, "sub_group_elect");

From 60471a520804fbd6611acd1c48f35549bb512deb Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Fri, 28 Jan 2022 09:15:44 +0000
Subject: [PATCH 156/158] Improve testing of sub_group_ballot (#1382)

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
---
 test_common/harness/mt19937.cpp               |   2 +
 test_common/harness/mt19937.h                 |   3 +
 test_conformance/subgroups/subhelpers.h       |   6 +
 .../subgroups/test_subgroup_ballot.cpp        | 191 +++++++++++++-----
 4 files changed, 147 insertions(+), 55 deletions(-)

diff --git a/test_common/harness/mt19937.cpp b/test_common/harness/mt19937.cpp
index c32d9bac6a..f5665deb23 100644
--- a/test_common/harness/mt19937.cpp
+++ b/test_common/harness/mt19937.cpp
@@ -277,3 +277,5 @@ double genrand_res53(MTdata d)
     unsigned long a = genrand_int32(d) >> 5, b = genrand_int32(d) >> 6;
     return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0);
 }
+
+bool genrand_bool(MTdata d) { return ((cl_uint)genrand_int32(d) & 1); }
diff --git a/test_common/harness/mt19937.h b/test_common/harness/mt19937.h
index 35c84933f8..98eec84352 100644
--- a/test_common/harness/mt19937.h
+++ b/test_common/harness/mt19937.h
@@ -90,6 +90,9 @@ double genrand_res53(MTdata /*data*/);
 
 #ifdef __cplusplus
 
+/* generates a random boolean */
+bool genrand_bool(MTdata /*data*/);
+
 #include <cassert>
 
 struct MTdataHolder
diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h
index aa4abc967e..153045d08c 100644
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -34,6 +34,12 @@ extern MTdata gMTdata;
 typedef std::bitset<128> bs128;
 extern cl_half_rounding_mode g_rounding_mode;
 
+static bs128 cl_uint4_to_bs128(cl_uint4 v)
+{
+    return bs128(v.s0) | (bs128(v.s1) << 32) | (bs128(v.s2) << 64)
+        | (bs128(v.s3) << 96);
+}
+
 static cl_uint4 bs128_to_cl_uint4(bs128 v)
 {
     bs128 bs128_ffffffff = 0xffffffffU;
diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp
index 837988ea1f..4148707eba 100644
--- a/test_conformance/subgroups/test_subgroup_ballot.cpp
+++ b/test_conformance/subgroups/test_subgroup_ballot.cpp
@@ -31,45 +31,93 @@ template <typename Ty> struct BALLOT
 
     static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
     {
-        // no work here
         int gws = test_params.global_workgroup_size;
         int lws = test_params.local_workgroup_size;
         int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
         int non_uniform_size = gws % lws;
+        int wg_number = gws / lws;
+        wg_number = non_uniform_size ? wg_number + 1 : wg_number;
+        int last_subgroup_size = 0;
+
+        for (int wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            if (non_uniform_size && wg_id == wg_number - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws,
+                                          last_subgroup_size);
+            }
+            for (int sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                int current_sbs;
+                if (last_subgroup_size && sb_id == sb_number - 1)
+                {
+                    current_sbs = last_subgroup_size;
+                }
+                else
+                {
+                    current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                }
+
+                for (int wi_id = 0; wi_id < current_sbs; wi_id++)
+                {
+                    cl_uint v;
+                    if (genrand_bool(gMTdata))
+                    {
+                        v = genrand_bool(gMTdata);
+                    }
+                    else if (genrand_bool(gMTdata))
+                    {
+                        v = 1U << ((genrand_int32(gMTdata) % 31) + 1);
+                    }
+                    else
+                    {
+                        v = genrand_int32(gMTdata);
+                    }
+                    cl_uint4 v4 = { v, 0, 0, 0 };
+                    t[wi_id + wg_offset] = v4;
+                }
+            }
+            // Copy the generated values into this work-group's slice of x
+            for (int wi_id = 0; wi_id < lws; ++wi_id)
+            {
+                x[wi_id] = t[wi_id];
+            }
+            x += lws;
+            m += 4 * lws;
+        }
     }
 
     static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
                            const WorkGroupParams &test_params)
     {
-        int wi_id, wg_id, sb_id;
         int gws = test_params.global_workgroup_size;
         int lws = test_params.local_workgroup_size;
         int sbs = test_params.subgroup_size;
         int sb_number = (lws + sbs - 1) / sbs;
-        int current_sbs = 0;
-        cl_uint expected_result, device_result;
         int non_uniform_size = gws % lws;
         int wg_number = gws / lws;
         wg_number = non_uniform_size ? wg_number + 1 : wg_number;
         int last_subgroup_size = 0;
 
-        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        for (int wg_id = 0; wg_id < wg_number; ++wg_id)
         { // for each work_group
             if (non_uniform_size && wg_id == wg_number - 1)
             {
                 set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws,
                                           last_subgroup_size);
             }
-
-            for (wi_id = 0; wi_id < lws; ++wi_id)
+            for (int wi_id = 0; wi_id < lws; ++wi_id)
             { // inside the work_group
-                // read device outputs for work_group
-                my[wi_id] = y[wi_id];
+                mx[wi_id] = x[wi_id]; // read host inputs for work_group
+                my[wi_id] = y[wi_id]; // read device outputs for work_group
             }
 
-            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            for (int sb_id = 0; sb_id < sb_number; ++sb_id)
             { // for each subgroup
                 int wg_offset = sb_id * sbs;
+                int current_sbs;
                 if (last_subgroup_size && sb_id == sb_number - 1)
                 {
                     current_sbs = last_subgroup_size;
@@ -78,25 +126,54 @@ template <typename Ty> struct BALLOT
                 {
                     current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
                 }
-                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+
+                bs128 expected_result_bs = 0;
+
+                std::set<int> active_work_items;
+                for (int wi_id = 0; wi_id < current_sbs; ++wi_id)
                 {
-                    device_result = my[wg_offset + wi_id];
-                    expected_result = 1;
-                    if (!compare(device_result, expected_result))
+                    if (test_params.work_items_mask.test(wi_id))
+                    {
+                        bool predicate = (mx[wg_offset + wi_id].s0 != 0);
+                        expected_result_bs |= (bs128(predicate) << wi_id);
+                        active_work_items.insert(wi_id);
+                    }
+                }
+                if (active_work_items.empty())
+                {
+                    continue;
+                }
+
+                cl_uint4 expected_result =
+                    bs128_to_cl_uint4(expected_result_bs);
+                for (const int &active_work_item : active_work_items)
+                {
+                    int wi_id = active_work_item;
+
+                    cl_uint4 device_result = my[wg_offset + wi_id];
+                    bs128 device_result_bs = cl_uint4_to_bs128(device_result);
+
+                    if (device_result_bs != expected_result_bs)
                     {
                         log_error(
                             "ERROR: sub_group_ballot mismatch for local id "
-                            "%d in sub group %d in group %d obtained %d, "
-                            "expected %d\n",
-                            wi_id, sb_id, wg_id, device_result,
-                            expected_result);
+                            "%d in sub group %d in group %d obtained {%d, %d, "
+                            "%d, %d}, expected {%d, %d, %d, %d}\n",
+                            wi_id, sb_id, wg_id, device_result.s0,
+                            device_result.s1, device_result.s2,
+                            device_result.s3, expected_result.s0,
+                            expected_result.s1, expected_result.s2,
+                            expected_result.s3);
                         return TEST_FAIL;
                     }
                 }
             }
+
+            x += lws;
             y += lws;
             m += 4 * lws;
         }
+
         return TEST_PASS;
     }
 };
@@ -724,27 +801,26 @@ __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type
 }
 )";
 std::string sub_group_ballot_source = R"(
-__kernel void test_sub_group_ballot(const __global Type *in, __global int4 *xy, __global Type *out) {
-    uint4 full_ballot = sub_group_ballot(1);
-    uint divergence_mask;
-    uint4 partial_ballot;
+__kernel void test_sub_group_ballot(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) {
     uint gid = get_global_id(0);
     XY(xy,gid);
-    if (get_sub_group_local_id() & 1) {
-        divergence_mask = 0xaaaaaaaa;
-        partial_ballot = sub_group_ballot(1);
-    } else {
-        divergence_mask = 0x55555555;
-        partial_ballot = sub_group_ballot(1);
+    uint subgroup_local_id = get_sub_group_local_id();
+    uint elect_work_item = 1 << (subgroup_local_id % 32);
+    uint work_item_mask;
+    if (subgroup_local_id < 32) {
+        work_item_mask = work_item_mask_vector.x;
+    } else if(subgroup_local_id < 64) {
+        work_item_mask = work_item_mask_vector.y;
+    } else if(subgroup_local_id < 96) {
+        work_item_mask = work_item_mask_vector.z;
+    } else if(subgroup_local_id < 128) {
+        work_item_mask = work_item_mask_vector.w;
     }
-     size_t lws = get_local_size(0);
-    uint4 masked_ballot = full_ballot;
-    masked_ballot.x &= divergence_mask;
-    masked_ballot.y &= divergence_mask;
-    masked_ballot.z &= divergence_mask;
-    masked_ballot.w &= divergence_mask;
-    out[gid] = all(masked_ballot == partial_ballot);
-
+    uint4 value = (uint4)(0, 0, 0, 0);
+    if (elect_work_item & work_item_mask) {
+        value = sub_group_ballot(in[gid].s0);
+    }
+    out[gid] = value;
 }
 )";
 std::string sub_group_inverse_ballot_source = R"(
@@ -952,42 +1028,47 @@ int test_subgroup_functions_ballot(cl_device_id device, cl_context context,
     error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::lt_mask>>(
         "get_sub_group_lt_mask");
 
-    // ballot functions
-    WorkGroupParams test_params_ballot(global_work_size, local_work_size);
-    test_params_ballot.save_kernel_source(
-        sub_group_ballot_bit_scan_find_source);
-    test_params_ballot.save_kernel_source(sub_group_ballot_source,
-                                          "sub_group_ballot");
-    test_params_ballot.save_kernel_source(sub_group_inverse_ballot_source,
-                                          "sub_group_inverse_ballot");
-    test_params_ballot.save_kernel_source(sub_group_ballot_bit_extract_source,
-                                          "sub_group_ballot_bit_extract");
+    // sub_group_ballot function
+    WorkGroupParams test_params_ballot(global_work_size, local_work_size, 3);
+    test_params_ballot.save_kernel_source(sub_group_ballot_source);
     RunTestForType rft_ballot(device, context, queue, num_elements,
                               test_params_ballot);
-    error |= rft_ballot.run_impl<cl_uint, BALLOT<cl_uint>>("sub_group_ballot");
     error |=
-        rft_ballot.run_impl<cl_uint4,
-                            BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>(
+        rft_ballot.run_impl<cl_uint4, BALLOT<cl_uint4>>("sub_group_ballot");
+
+    // ballot arithmetic functions
+    WorkGroupParams test_params_arith(global_work_size, local_work_size);
+    test_params_arith.save_kernel_source(sub_group_ballot_bit_scan_find_source);
+    test_params_arith.save_kernel_source(sub_group_inverse_ballot_source,
+                                         "sub_group_inverse_ballot");
+    test_params_arith.save_kernel_source(sub_group_ballot_bit_extract_source,
+                                         "sub_group_ballot_bit_extract");
+    RunTestForType rft_arith(device, context, queue, num_elements,
+                             test_params_arith);
+    error |=
+        rft_arith.run_impl<cl_uint4,
+                           BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>(
             "sub_group_inverse_ballot");
-    error |= rft_ballot.run_impl<
+    error |= rft_arith.run_impl<
         cl_uint4, BALLOT_BIT_EXTRACT<cl_uint4, BallotOp::ballot_bit_extract>>(
         "sub_group_ballot_bit_extract");
-    error |= rft_ballot.run_impl<
+    error |= rft_arith.run_impl<
         cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_bit_count>>(
         "sub_group_ballot_bit_count");
-    error |= rft_ballot.run_impl<
+    error |= rft_arith.run_impl<
         cl_uint4,
         BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_inclusive_scan>>(
         "sub_group_ballot_inclusive_scan");
-    error |= rft_ballot.run_impl<
+    error |= rft_arith.run_impl<
         cl_uint4,
         BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_exclusive_scan>>(
         "sub_group_ballot_exclusive_scan");
-    error |= rft_ballot.run_impl<
+    error |= rft_arith.run_impl<
         cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_lsb>>(
         "sub_group_ballot_find_lsb");
-    error |= rft_ballot.run_impl<
+    error |= rft_arith.run_impl<
         cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_msb>>(
         "sub_group_ballot_find_msb");
+
     return error;
 }

From 6b14d408dc8cc0a05bca554e8b43d269fba179d0 Mon Sep 17 00:00:00 2001
From: Stuart Brady <stuart.brady@arm.com>
Date: Thu, 10 Feb 2022 06:24:33 +0000
Subject: [PATCH 157/158] Improve testing of kernel arg info in pipe_info test
 (#1326)

The test now checks that CL_KERNEL_ARG_INFO_NOT_AVAILABLE is returned
when calling clGetKernelArgInfo() with offline compilation modes.

The correct function name is printed if clGetKernelArgInfo() fails
when using online compilation (and not "clSetKernelArgInfo()").

When using online compilation, if the actual arg type qualifier is not
as expected, the actual qualifier is now logged, and the return value
is now TEST_FAIL (-1) as per other failures (and not 1).

All other test pass/fail values used in the test now use TEST_PASS
and TEST_FAIL instead of 0 and -1 literals.

An unnecessary cast of pipe_kernel_code has been removed.

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
---
 test_conformance/pipes/test_pipe_info.cpp | 40 ++++++++++++++---------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/test_conformance/pipes/test_pipe_info.cpp b/test_conformance/pipes/test_pipe_info.cpp
index 7543c6cd99..e7b486dbe9 100644
--- a/test_conformance/pipes/test_pipe_info.cpp
+++ b/test_conformance/pipes/test_pipe_info.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 #include "procs.h"
+#include "harness/parseParameters.h"
 
 const char* pipe_kernel_code = {
     "__kernel void pipe_kernel(__write_only pipe int out_pipe)\n"
@@ -39,8 +40,7 @@ int test_pipe_info( cl_device_id deviceID, cl_context context, cl_command_queue
 
     if (pipe_width != returnVal)
     {
-        log_error("Error in clGetPipeInfo() check of pipe packet size\n");
-        return -1;
+        test_fail("Error in clGetPipeInfo() check of pipe packet size\n");
     }
     else
     {
@@ -52,29 +52,37 @@ int test_pipe_info( cl_device_id deviceID, cl_context context, cl_command_queue
 
     if(pipe_depth != returnVal)
     {
-        log_error( "Error in clGetPipeInfo() check of pipe max packets\n" );
-        return -1;
+        test_fail("Error in clGetPipeInfo() check of pipe max packets\n");
     }
     else
     {
         log_info( " CL_PIPE_MAX_PACKETS passed.\n" );
     }
 
-    err = create_single_kernel_helper_with_build_options(context, &program, &kernel, 1, (const char**)&pipe_kernel_code, "pipe_kernel", "-cl-std=CL2.0 -cl-kernel-arg-info");
-    test_error_ret(err, " Error creating program", -1);
+    err = create_single_kernel_helper_with_build_options(
+        context, &program, &kernel, 1, &pipe_kernel_code, "pipe_kernel",
+        "-cl-std=CL2.0 -cl-kernel-arg-info");
+    test_error_fail(err, "Error creating program");
 
     cl_kernel_arg_type_qualifier arg_type_qualifier = 0;
-    cl_kernel_arg_type_qualifier expected_type_qualifier = CL_KERNEL_ARG_TYPE_PIPE;
-    err = clGetKernelArgInfo( kernel, 0, CL_KERNEL_ARG_TYPE_QUALIFIER, sizeof(arg_type_qualifier), &arg_type_qualifier, NULL );
-    test_error_ret(err, " clSetKernelArgInfo failed", -1);
-    err = (arg_type_qualifier != expected_type_qualifier);
-
-    if(err)
+    err = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_TYPE_QUALIFIER,
+                             sizeof(arg_type_qualifier), &arg_type_qualifier,
+                             NULL);
+    if (gCompilationMode == kOnline)
     {
-        print_error(err, "ERROR: Bad type qualifier\n");
-        return -1;
+        test_error_fail(err, "clGetKernelArgInfo failed");
+        if (arg_type_qualifier != CL_KERNEL_ARG_TYPE_PIPE)
+        {
+            test_fail("ERROR: Incorrect type qualifier: %i\n",
+                      arg_type_qualifier);
+        }
+    }
+    else
+    {
+        test_failure_error_ret(err, CL_KERNEL_ARG_INFO_NOT_AVAILABLE,
+                               "clGetKernelArgInfo error not as expected",
+                               TEST_FAIL);
     }
 
-    return err;
-
+    return TEST_PASS;
 }

From 2d93b122c3078cd67a0528ad9e791dbcadaf03d6 Mon Sep 17 00:00:00 2001
From: Jim Lewis <j.lewis1@samsung.com>
Date: Tue, 22 Feb 2022 10:49:35 -0600
Subject: [PATCH 158/158] Sync submission_details with conformance doc v26
 (#1389)

Add "Patches" field
---
 test_conformance/submission_details_template.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test_conformance/submission_details_template.txt b/test_conformance/submission_details_template.txt
index 9d276a62e2..ff62483752 100644
--- a/test_conformance/submission_details_template.txt
+++ b/test_conformance/submission_details_template.txt
@@ -81,6 +81,12 @@ Platform Version:
 # 
 Tests version:
 
+# Commit SHAs (7-digit) of any cherry-picked patches subsequent to tagged
+# version. Any patches included must apply without conflicts to the tagged
+# version in the order listed.
+#
+Patches:
+
 # Implementations that support cl_khr_icd are required to use a loader to run
 # the tests and document the loader that was used.
 #